From 1a56c05ee9bd7112a7d382fea9cd460b4b432709 Mon Sep 17 00:00:00 2001 From: Marisha Date: Sat, 9 Nov 2024 22:30:51 -0600 Subject: [PATCH 1/7] Replaced instances of C++14 with C++17 to build pytorch Replaced deprecated torch.cuda.amp with device_type='cuda' Autoformatted with Pep8 Various little changes to satisfy Pylance, Pep8, and Mypy Replaced deprecated installation in readme with pip version --- .gitignore | 3 +- README.md | 8 +--- dqtorch/backend.py | 12 ++--- dqtorch/dqtorch.py | 90 +++++++++++++++++++++----------------- dqtorch/quaternion_cuda.py | 81 +++++++++++++++++----------------- examples.py | 23 +++++----- setup.py | 19 ++++---- test_speed.py | 14 +++--- 8 files changed, 129 insertions(+), 121 deletions(-) diff --git a/.gitignore b/.gitignore index 7009141..007a939 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ config/*.ini .vscode/ .idea/ *.sublime-project -*.sublime-workspace \ No newline at end of file +*.sublime-workspace +.mypy_cache \ No newline at end of file diff --git a/README.md b/README.md index 9598760..fc4dd76 100644 --- a/README.md +++ b/README.md @@ -22,15 +22,11 @@ to do: ## Get Started ### Requirments -tested in Pytorch 1.12, CUDA-11.1, gcc-6.3.0. - -To do: check if it compiles with: -- [ ] Pytorch 2.0 -- [ ] CUDA 11.7 +tested in Pytorch 2.5.1+cu124, CUDA-12.4, gcc-9.2.0 ### Install ``` -python setup.py install +python -m pip install . 
``` ### Test ``` diff --git a/dqtorch/backend.py b/dqtorch/backend.py index 7b0d033..8a6bc69 100644 --- a/dqtorch/backend.py +++ b/dqtorch/backend.py @@ -4,12 +4,12 @@ _src_path = os.path.dirname(os.path.abspath(__file__)) nvcc_flags = [ - '-O3', '-std=c++14', + '-O3', '-std=c++17', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', ] if os.name == "posix": - c_flags = ['-O3', '-std=c++14'] + c_flags = ['-O3', '-std=c++17'] elif os.name == "nt": c_flags = ['/O2', '/std:c++17'] @@ -17,7 +17,8 @@ def find_cl_path(): import glob for edition in ["Enterprise", "Professional", "BuildTools", "Community"]: - paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True) + paths = sorted(glob.glob( + r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True) if paths: return paths[0] @@ -25,7 +26,8 @@ def find_cl_path(): if os.system("where cl.exe >nul 2>nul") != 0: cl_path = find_cl_path() if cl_path is None: - raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation") + raise RuntimeError( + "Could not locate a supported Microsoft Visual C++ installation") os.environ["PATH"] += ";" + cl_path _backend = load(name='_quaternion_cuda', @@ -37,4 +39,4 @@ def find_cl_path(): ]], ) -__all__ = ['_backend'] \ No newline at end of file +__all__ = ['_backend'] diff --git a/dqtorch/dqtorch.py b/dqtorch/dqtorch.py index 3e35a55..5339e79 100644 --- a/dqtorch/dqtorch.py +++ b/dqtorch/dqtorch.py @@ -7,13 +7,11 @@ from enum import Enum, unique - Quaternion = torch.Tensor DualQuaternions = Tuple[Quaternion, Quaternion] QuaternionTranslation = Tuple[Quaternion, torch.Tensor] - ''' quaternion library from pytorch3d https://pytorch3d.readthedocs.io/en/latest/_modules/pytorch3d/transforms/rotation_conversions.html @@ -33,7 +31,7 @@ def quaternion_conjugate(q: Quaternion) 
-> Quaternion: # return _quaternion_conjugate_cuda(q.contiguous().view(-1,4)).view(out_shape) if q.is_cuda: out_shape = q.shape - return _quaternion_conjugate_cuda(q.contiguous().view(-1,4)).view(out_shape) + return _quaternion_conjugate_cuda(q.contiguous().view(-1, 4)).view(out_shape) # type:ignore # nopep8 else: return _quaternion_conjugate_pytorch(q) @@ -53,6 +51,8 @@ def standardize_quaternion(quaternions: Quaternion) -> Quaternion: return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions) # @torch.jit.script + + def _quaternion_mul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: """ Multiply two quaternions. @@ -73,25 +73,28 @@ def _quaternion_mul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: oz = aw * bz + ax * by - ay * bx + az * bw return torch.stack((ow, ox, oy, oz), -1) -def _quaternion_4D_mul_3D(a:torch.Tensor, b_xyz:torch.Tensor) -> torch.Tensor: + +def _quaternion_4D_mul_3D(a: torch.Tensor, b_xyz: torch.Tensor) -> torch.Tensor: aw, ax, ay, az = torch.unbind(a, -1) bx, by, bz = torch.unbind(b_xyz, -1) ow = - ax * bx - ay * by - az * bz ox = aw * bx + ay * bz - az * by oy = aw * by - ax * bz + az * bx oz = aw * bz + ax * by - ay * bx - return torch.stack((ow, ox, oy, oz), -1) + return torch.stack((ow, ox, oy, oz), -1) + -def _quaternion_3D_mul_4D(a_xyz:torch.Tensor, b:torch.Tensor) -> torch.Tensor: +def _quaternion_3D_mul_4D(a_xyz: torch.Tensor, b: torch.Tensor) -> torch.Tensor: ax, ay, az = torch.unbind(a_xyz, -1) bw, bx, by, bz = torch.unbind(b, -1) - ow = - ax * bx - ay * by - az * bz - ox = ax * bw + ay * bz - az * by - oy = - ax * bz + ay * bw + az * bx - oz = ax * by - ay * bx + az * bw + ow = - ax * bx - ay * by - az * bz + ox = ax * bw + ay * bz - az * by + oy = - ax * bz + ay * bw + az * bx + oz = ax * by - ay * bx + az * bw return torch.stack((ow, ox, oy, oz), -1) -def _quaternion_mul_pytorch(a: torch.Tensor, b:torch.Tensor): + +def _quaternion_mul_pytorch(a: torch.Tensor, b: torch.Tensor): ''' native pytorch 
implementation, only used as a baseline. ''' @@ -103,14 +106,12 @@ def _quaternion_mul_pytorch(a: torch.Tensor, b:torch.Tensor): return _quaternion_4D_mul_3D(a, b) else: raise ValueError(f"Invalid input shapes.") - - def quaternion_mul(a: Quaternion, b: Quaternion) -> Quaternion: if a.is_cuda: ouput_shape = list(a.shape[:-1]) + [4] - return _quaternion_mul_cuda(a.view(-1, a.shape[-1]), b.view(-1, b.shape[-1])).view(ouput_shape) + return _quaternion_mul_cuda(a.view(-1, a.shape[-1]), b.view(-1, b.shape[-1])).view(ouput_shape) # type:ignore # nopep8 else: return _quaternion_mul_pytorch(a, b) @@ -159,7 +160,7 @@ def axis_angle_to_quaternion(axis_angle: torch.Tensor) -> torch.Tensor: def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor: """ Convert rotations given as quaternions to rotation matrices. - + Args: quaternions: quaternions with real part first, @@ -183,7 +184,7 @@ def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor: o2 = two_s * (ij - kr) o3 = two_s * (ik + jr) o4 = two_s * (ij + kr) - + o5 = 1 - two_s * (ii + kk) o6 = two_s * (jk - ir) o7 = two_s * (ik - jr) @@ -195,6 +196,7 @@ def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor: return o.view(quaternions.shape[:-1] + (3, 3)) + def quaternion_apply(quaternion: Quaternion, point: torch.Tensor) -> torch.Tensor: """ Apply the rotation given by a quaternion to a 3D point. 
@@ -213,38 +215,42 @@ def quaternion_apply(quaternion: Quaternion, point: torch.Tensor) -> torch.Tenso ) return out[..., 1:].contiguous() -def quaternion_translation_apply(q:Quaternion, t:torch.Tensor, point:torch.Tensor) -> torch.Tensor: + +def quaternion_translation_apply(q: Quaternion, t: torch.Tensor, point: torch.Tensor) -> torch.Tensor: p = quaternion_apply(q, point) return p + t - -def quaternion_translation_compose(qt1:QuaternionTranslation, qt2:QuaternionTranslation) -> QuaternionTranslation: + + +def quaternion_translation_compose(qt1: QuaternionTranslation, qt2: QuaternionTranslation) -> QuaternionTranslation: qr = quaternion_mul(qt1[0], qt2[0]) t = quaternion_apply(qt1[0], qt2[1]) + qt1[1] return (qr, t) -def quaternion_translation_inverse(q:Quaternion, t:torch.Tensor) -> Tuple[Quaternion, torch.Tensor]: + +def quaternion_translation_inverse(q: Quaternion, t: torch.Tensor) -> Tuple[Quaternion, torch.Tensor]: q_inv = quaternion_conjugate(q) t_inv = quaternion_apply(q_inv, -t) return q_inv, t_inv + def quaternion_translation_to_dual_quaternion( - q:torch.Tensor, t:torch.Tensor) -> DualQuaternions: + q: torch.Tensor, t: torch.Tensor) -> DualQuaternions: ''' https://cs.gmu.edu/~jmlien/teaching/cs451/uploads/Main/dual-quaternion.pdf ''' - q_d = 0.5* quaternion_mul(t, q) + q_d = 0.5 * quaternion_mul(t, q) return (q, q_d) -def dual_quaternion_to_quaternion_translation(dq:DualQuaternions) -> DualQuaternions: +def dual_quaternion_to_quaternion_translation(dq: DualQuaternions) -> DualQuaternions: q_r = dq[0] q_d = dq[1] t = 2*quaternion_mul(q_d, quaternion_conjugate(q_r))[..., 1:] - + return q_r, t -def dual_quaternion_mul(dq1:DualQuaternions, dq2:DualQuaternions) -> DualQuaternions: +def dual_quaternion_mul(dq1: DualQuaternions, dq2: DualQuaternions) -> DualQuaternions: q_r1 = dq1[0] q_d1 = dq1[1] q_r2 = dq2[0] @@ -253,7 +259,8 @@ def dual_quaternion_mul(dq1:DualQuaternions, dq2:DualQuaternions) -> DualQuatern r_d = quaternion_mul(q_r1, q_d2) + 
quaternion_mul(q_d1, q_r2) return (r_r, r_d) -def dual_quaternion_apply(dq:DualQuaternions, point:torch.Tensor) -> torch.Tensor: + +def dual_quaternion_apply(dq: DualQuaternions, point: torch.Tensor) -> torch.Tensor: ''' assuming the input dual quaternion is normalized. ''' @@ -261,19 +268,19 @@ def dual_quaternion_apply(dq:DualQuaternions, point:torch.Tensor) -> torch.Tenso return quaternion_translation_apply(q, t, point) -def dual_quaternion_q_conjugate(dq:DualQuaternions) -> DualQuaternions: +def dual_quaternion_q_conjugate(dq: DualQuaternions) -> DualQuaternions: r = quaternion_conjugate(dq[0]) d = quaternion_conjugate(dq[1]) return (r, d) -def dual_quaternion_d_conjugate(dq:DualQuaternions) -> DualQuaternions: +def dual_quaternion_d_conjugate(dq: DualQuaternions) -> DualQuaternions: return (dq[0], -dq[1]) -def dual_quaternion_3rd_conjugate(dq:DualQuaternions) -> DualQuaternions: +def dual_quaternion_3rd_conjugate(dq: DualQuaternions) -> DualQuaternions: return dual_quaternion_d_conjugate( - dual_quaternion_q_conjugate(dq) ) + dual_quaternion_q_conjugate(dq)) # def dual_quaternion_inverse(dq:DualQuaternions) -> DualQuaternions: @@ -281,7 +288,8 @@ def dual_quaternion_3rd_conjugate(dq:DualQuaternions) -> DualQuaternions: dual_quaternion_inverse = dual_quaternion_q_conjugate -def dual_quaternion_rectify(dq:DualQuaternions) -> DualQuaternions: + +def dual_quaternion_rectify(dq: DualQuaternions) -> DualQuaternions: ''' input: (unit quaternion, 4D vector w') -> dual quaternion, which satisfies (r, 0.5* t r) solve: min | q - w' | s.t. 
w^T r = 0 @@ -291,6 +299,7 @@ def dual_quaternion_rectify(dq:DualQuaternions) -> DualQuaternions: return (q_r, q_d) + def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor: """ Returns torch.sqrt(torch.max(0, x)) @@ -336,10 +345,14 @@ def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor: # we produce the desired quaternion multiplied by each of r, i, j, k quat_by_rijk = torch.stack( [ - torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1), - torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1), - torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1), - torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1), + torch.stack([q_abs[..., 0] ** 2, m21 - m12, + m02 - m20, m10 - m01], dim=-1), + torch.stack([m21 - m12, q_abs[..., 1] ** 2, + m10 + m01, m02 + m20], dim=-1), + torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] + ** 2, m12 + m21], dim=-1), + torch.stack([m10 - m01, m20 + m02, m21 + m12, + q_abs[..., 3] ** 2], dim=-1), ], dim=-2, ) @@ -353,9 +366,6 @@ def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor: # forall i; we pick the best-conditioned one (with the largest denominator) return quat_candidates[ - torch.nn.functional.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, : # pyre-ignore[16] + # pyre-ignore[16] + torch.nn.functional.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, : ].reshape(batch_dim + (4,)) - - - - diff --git a/dqtorch/quaternion_cuda.py b/dqtorch/quaternion_cuda.py index 4c2b24f..0f4c364 100644 --- a/dqtorch/quaternion_cuda.py +++ b/dqtorch/quaternion_cuda.py @@ -1,18 +1,16 @@ import torch from torch.autograd import Function from torch.autograd.function import once_differentiable -from torch.cuda.amp import custom_bwd, custom_fwd +from torch.amp.autocast_mode import custom_bwd, custom_fwd + +from .backend import _backend -try: - import _quaternion_cuda as _backend -except ImportError: - from .backend import _backend def 
_get_broadcast_meta_data(inputs_1, inputs_2): - B1 = inputs_1.shape[0] # batch size, coord dim + B1 = inputs_1.shape[0] # batch size, coord dim B2 = inputs_2.shape[0] B = max(B1, B2) - assert(B % B1 == 0 and B % B2 == 0) + assert (B % B1 == 0 and B % B2 == 0) R1 = B // B1 R2 = B // B2 D1 = inputs_1.shape[1] @@ -20,76 +18,82 @@ def _get_broadcast_meta_data(inputs_1, inputs_2): # print(B, R1, R2, D1, D2, B1, B2) return B, R1, R2, D1, D2, B1, B2 + class _Quaternion_mul_backward(Function): @staticmethod - @custom_fwd(cast_inputs=torch.half) + @custom_fwd(cast_inputs=torch.half, device_type='cuda') def forward(ctx, grad, inputs_1, inputs_2): - B, R1, R2, D1, D2, B1, B2 = _get_broadcast_meta_data(inputs_1, inputs_2) + B, R1, R2, D1, D2, B1, B2 = _get_broadcast_meta_data( + inputs_1, inputs_2) dtype, device = inputs_1.dtype, inputs_1.device grad_inputs_1 = torch.empty(B1, D1, device=device, dtype=dtype) grad_inputs_2 = torch.empty(B2, D2, device=device, dtype=dtype) - _backend.quaternion_mul_backward(grad, B, R1, R2, D1, D2, inputs_1, inputs_2, grad_inputs_1, grad_inputs_2) + _backend.quaternion_mul_backward(grad, B, R1, R2, D1, D2, inputs_1, inputs_2, grad_inputs_1, grad_inputs_2) # type: ignore # nopep8 ctx.save_for_backward(grad, inputs_1, inputs_2) - return grad_inputs_1, grad_inputs_2 + return grad_inputs_1, grad_inputs_2 @staticmethod @once_differentiable - @custom_bwd + @custom_bwd(device_type='cuda') def backward(ctx, *grad_outputs): grad_out_1, grad_out_2 = grad_outputs grad_out_1, grad_out_2 = grad_out_1.contiguous(), grad_out_2.contiguous() grad, inputs_1, inputs_2 = ctx.saved_tensors - B, R1, R2, D1, D2, B1, B2 = _get_broadcast_meta_data(inputs_1, inputs_2) + B, R1, R2, D1, D2, B1, B2 = _get_broadcast_meta_data( + inputs_1, inputs_2) dtype, device = inputs_1.dtype, inputs_1.device grad_grad = torch.empty(B, 4, device=device, dtype=dtype) grad_grad_inputs_1 = torch.empty(B1, D1, device=device, dtype=dtype) grad_grad_inputs_2 = torch.empty(B2, D2, device=device, 
dtype=dtype) - _backend.quaternion_mul_backward_backward(grad_out_1, grad_out_2, - B, R1, R2, D1, D2, - grad, inputs_1, inputs_2, - grad_grad, grad_grad_inputs_1, grad_grad_inputs_2) + _backend.quaternion_mul_backward_backward(grad_out_1, grad_out_2, # type: ignore + B, R1, R2, D1, D2, + grad, inputs_1, inputs_2, + grad_grad, grad_grad_inputs_1, grad_grad_inputs_2) return grad_grad, grad_grad_inputs_1, grad_grad_inputs_2 + _quaternion_mul_backward = _Quaternion_mul_backward.apply + class _Quaternion_mul(Function): @staticmethod - @custom_fwd(cast_inputs=torch.half) - def forward(ctx, inputs_1:torch.Tensor, inputs_2:torch.Tensor): + @custom_fwd(cast_inputs=torch.half, device_type='cuda') + def forward(ctx, inputs_1: torch.Tensor, inputs_2: torch.Tensor): # inputs: [B, input_dim], float in [-1, 1] # RETURN: [B, F], float # calc_grad_inputs = inputs_1.requires_grad or inputs_2.requires_grad inputs_1 = inputs_1.contiguous() inputs_2 = inputs_2.contiguous() - - B, R1, R2, D1, D2, B1, B2 = _get_broadcast_meta_data(inputs_1, inputs_2) - assert(B == B1 and B==B2) - assert(D1 == 3 or D1 == 4) - assert(D2 == 3 or D2 == 4) + + B, R1, R2, D1, D2, B1, B2 = _get_broadcast_meta_data( + inputs_1, inputs_2) + assert (B == B1 and B == B2) + assert (D1 == 3 or D1 == 4) + assert (D2 == 3 or D2 == 4) dtype = inputs_1.dtype device = inputs_1.device - - outputs = torch.empty(B, 4, dtype=dtype, device=device) + outputs = torch.empty(B, 4, dtype=dtype, device=device) - _backend.quaternion_mul_forward(inputs_1, inputs_2, outputs, B, R1, R2, D1, D2) + _backend.quaternion_mul_forward(inputs_1, inputs_2, outputs, B, R1, R2, D1, D2) # type: ignore # nopep8 ctx.save_for_backward(inputs_1, inputs_2) - + return outputs - + @staticmethod - @custom_bwd - def backward(ctx, grad): + @custom_bwd(device_type='cuda') + def backward(ctx, grad): # type: ignore # grad: [B, C * C] # if ctx.calc_grad_inputs: grad = grad.contiguous() inputs_1, inputs_2 = ctx.saved_tensors - grad_inputs_1, grad_inputs_2 = 
_quaternion_mul_backward(grad, inputs_1, inputs_2) + grad_inputs_1, grad_inputs_2 = _quaternion_mul_backward( + grad, inputs_1, inputs_2) # type: ignore return grad_inputs_1, grad_inputs_2 @@ -99,18 +103,17 @@ def backward(ctx, grad): class _Quaternion_conjugate(torch.autograd.Function): @staticmethod - @custom_fwd(cast_inputs=torch.half) - def forward(ctx, inputs:torch.Tensor): - B = inputs.shape[0] # batch size, coord dim + @custom_fwd(cast_inputs=torch.half, device_type='cuda') + def forward(ctx, inputs: torch.Tensor): + B = inputs.shape[0] # batch size, coord dim outputs = torch.empty_like(inputs) - _backend.quaternion_conjugate(inputs.contiguous(), B, outputs) + _backend.quaternion_conjugate(inputs.contiguous(), B, outputs) # type: ignore # nopep8 return outputs @staticmethod - @custom_bwd - def backward(ctx, grad): + @custom_bwd(device_type='cuda') + def backward(ctx, grad): # type: ignore return _Quaternion_conjugate.apply(grad) quaternion_conjugate = _Quaternion_conjugate.apply - diff --git a/examples.py b/examples.py index 0999898..7cb0a32 100644 --- a/examples.py +++ b/examples.py @@ -4,7 +4,7 @@ n_pts = 4096*128 # get a normalized quaternion from a batch of random axis angles -qr1 = dqtorch.axis_angle_to_quaternion(torch.randn(n_pts, 3).cuda()) +qr1 = dqtorch.axis_angle_to_quaternion(torch.randn(n_pts, 3).cuda()) qr2 = dqtorch.axis_angle_to_quaternion(torch.randn(n_pts, 3).cuda()) # quaternion multiplication @@ -20,17 +20,18 @@ inv_qr1 = dqtorch.quaternion_conjugate(qr1) # apply inverse transform to p2, and we should get back p1 p1_by_inv = dqtorch.quaternion_apply(inv_qr1, p2) -print((p1_by_inv-p1).abs().max()) # should be close to 0 +print((p1_by_inv-p1).abs().max()) # should be close to 0 # se3 representation by quaternion + translation -t1 = torch.randn(n_pts, 3).cuda() # create random translations +t1 = torch.randn(n_pts, 3).cuda() # create random translations # apply se3 transformation to points p2 = dqtorch.quaternion_translation_apply(qr1, t1, 
p1) # inverse of se3 transformation qr1_inv, t1_inv = dqtorch.quaternion_translation_inverse(qr1, t1) # compose two se3 transformation qr3, t3 = dqtorch.quaternion_translation_compose((qr1_inv, t1_inv), (qr1, t1)) -print((qr3[..., 0]-1).abs().max(), qr3[..., 1:].abs().max(), t3.abs().max()) # should be close to 0 +print((qr3[..., 0]-1).abs().max(), qr3[..., 1:].abs().max(), + t3.abs().max()) # should be close to 0 # se3 representation by dual quaternions, which is stored as a tuple of two tensors @@ -38,18 +39,14 @@ dq1_inv = dqtorch.quaternion_translation_to_dual_quaternion(qr1_inv, t1_inv) # compose two se3 transformation dq3 = dqtorch.dual_quaternion_mul(dq1_inv, dq1) -print((dq3[0][..., 0]-1).abs().max(), dq3[0][..., 1:].abs().max(), dq3[1].abs().max()) # should be close to 0 +print((dq3[0][..., 0]-1).abs().max(), dq3[0][..., 1:].abs().max(), + dq3[1].abs().max()) # should be close to 0 # apply se3 transformation to points p2 = dqtorch.dual_quaternion_apply(dq1, p1) # dual quaternion inverse dq3 = dqtorch.dual_quaternion_inverse(dq1) -print((dq3[0]-dq1_inv[0]).abs().max(), (dq3[1]-dq1_inv[1]).abs().max()) # should be close to 0 +print((dq3[0]-dq1_inv[0]).abs().max(), + (dq3[1]-dq1_inv[1]).abs().max()) # should be close to 0 # convert from dual quaternion to quaternion translation qr3, t3 = dqtorch.dual_quaternion_to_quaternion_translation(dq1) -print((qr3-qr1).abs().max(), (t3-t1).abs().max()) # should be close to 0 - - - - - - +print((qr3-qr1).abs().max(), (t3-t1).abs().max()) # should be close to 0 diff --git a/setup.py b/setup.py index c548210..a19eedc 100644 --- a/setup.py +++ b/setup.py @@ -5,12 +5,12 @@ _src_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dqtorch') nvcc_flags = [ - '-O3', '-std=c++14', + '-O3', '-std=c++17', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', ] if os.name == "posix": - c_flags = ['-O3', '-std=c++14'] + c_flags = ['-O3', '-std=c++17'] elif os.name == "nt": 
c_flags = ['/O2', '/std:c++17'] @@ -18,7 +18,8 @@ def find_cl_path(): import glob for edition in ["Enterprise", "Professional", "BuildTools", "Community"]: - paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True) + paths = sorted(glob.glob( + r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True) if paths: return paths[0] @@ -26,21 +27,23 @@ def find_cl_path(): if os.system("where cl.exe >nul 2>nul") != 0: cl_path = find_cl_path() if cl_path is None: - raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation") + raise RuntimeError( + "Could not locate a supported Microsoft Visual C++ installation") os.environ["PATH"] += ";" + cl_path LIB_NAME = 'dqtorch' setup( - name=LIB_NAME, # package name, import this to use python API - description='A faster pytorch libraray for (dual) quaternion batched operations.', + name=LIB_NAME, # package name, import this to use python API + description='A faster pytorch library for (dual) quaternion batched operations.', license="MIT", author="Chaoyang Wang", python_requires=">=3.6", + setup_requires=["torch>=1.12"], install_requires=["torch>=1.12"], packages=[LIB_NAME], ext_modules=[ CUDAExtension( - name='_quaternion_cuda', # extension name, import this to use CUDA API + name='_quaternion_cuda', # extension name, import this to use CUDA API sources=[os.path.join(_src_path, 'src', f) for f in [ 'quaternion.cu', 'bindings.cpp', @@ -54,4 +57,4 @@ def find_cl_path(): cmdclass={ 'build_ext': BuildExtension, } -) \ No newline at end of file +) diff --git a/test_speed.py b/test_speed.py index b2a4bfa..f6e6f2b 100644 --- a/test_speed.py +++ b/test_speed.py @@ -1,7 +1,8 @@ import torch.utils.benchmark as benchmark -import dqtorch +import dqtorch import torch + def _forward_and_backward(fun, args): for x in args: x.requires_grad = True @@ -18,7 +19,7 @@ def 
_forward_and_backward(fun, args): qr_1 = dqtorch.axis_angle_to_quaternion(torch.randn(n_pts, 3).cuda()) qr_2 = dqtorch.axis_angle_to_quaternion(torch.randn(n_pts, 3).cuda()) -# 1. test the speed up of quaternino_mul over native pytorch implementaion +# 1. test the speed up of quaternino_mul over native pytorch implementaion print('------ quaternion_mul --------') t = benchmark.Timer( stmt='_quaternion_mul_pytorch(qr_1, qr_2)', @@ -35,13 +36,13 @@ def _forward_and_backward(fun, args): t = benchmark.Timer( stmt='_forward_and_backward(fun, (qr_1, qr_2))', setup='from __main__ import _forward_and_backward', - globals={'fun':dqtorch._quaternion_mul_pytorch, 'qr_1': qr_1, 'qr_2': qr_2}) + globals={'fun': dqtorch._quaternion_mul_pytorch, 'qr_1': qr_1, 'qr_2': qr_2}) print(t.timeit(200)) t = benchmark.Timer( stmt='_forward_and_backward(fun, (qr_1, qr_2))', setup='from __main__ import _forward_and_backward', - globals={'fun':dqtorch.quaternion_mul, 'qr_1': qr_1, 'qr_2': qr_2}) + globals={'fun': dqtorch.quaternion_mul, 'qr_1': qr_1, 'qr_2': qr_2}) print('native pytorch: ', t.timeit(200)) print('------ quaternion_conjugate --------') @@ -57,8 +58,3 @@ def _forward_and_backward(fun, args): setup='from dqtorch import _quaternion_conjugate_pytorch', globals={'qr_1': qr_1}) print(t.timeit(200)) - - - - - \ No newline at end of file From 55004ed5ff9e1ca1a42c76dd162b474e0c634d90 Mon Sep 17 00:00:00 2001 From: Marisha Date: Fri, 15 Nov 2024 17:52:48 -0600 Subject: [PATCH 2/7] More AutoFormatting --- dqtorch/src/bindings.cpp | 3 +- dqtorch/src/quaternion.cu | 378 ++++++++++++++++++++------------------ dqtorch/src/quaternion.h | 14 +- 3 files changed, 208 insertions(+), 187 deletions(-) diff --git a/dqtorch/src/bindings.cpp b/dqtorch/src/bindings.cpp index 5c1ea8b..f3fac42 100644 --- a/dqtorch/src/bindings.cpp +++ b/dqtorch/src/bindings.cpp @@ -2,7 +2,8 @@ #include "quaternion.h" -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ 
m.def("quaternion_mul_forward", &quaternion_mul_forward, "quaternion multiplication forward (CUDA)"); m.def("quaternion_mul_backward", &quaternion_mul_backward, "quaternion multiplication backward (CUDA)"); m.def("quaternion_mul_backward_backward", &quaternion_mul_backward_backward, "quaternion multiplication backward (CUDA)"); diff --git a/dqtorch/src/quaternion.cu b/dqtorch/src/quaternion.cu index c93e691..8af25c3 100644 --- a/dqtorch/src/quaternion.cu +++ b/dqtorch/src/quaternion.cu @@ -12,32 +12,35 @@ #include - #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor") #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor") #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor") -#define CHECK_IS_CONTIGUOUS_FLOAT_CUDA(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x); CHECK_IS_FLOATING(x) - +#define CHECK_IS_CONTIGUOUS_FLOAT_CUDA(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x); \ + CHECK_IS_FLOATING(x) template -__host__ __device__ T div_round_up(T val, T divisor) { +__host__ __device__ T div_round_up(T val, T divisor) +{ return (val + divisor - 1) / divisor; } template __global__ void kernel_quaternion_mul( - const scalar_t * __restrict__ inputs_1, - const scalar_t * __restrict__ inputs_2, - scalar_t * outputs, - uint32_t B, + const scalar_t *__restrict__ inputs_1, + const scalar_t *__restrict__ inputs_2, + scalar_t *outputs, + uint32_t B, uint32_t R1, uint32_t R2, uint32_t D1, - uint32_t D2 -) { + uint32_t D2) +{ const uint32_t b = threadIdx.x + blockIdx.x * blockDim.x; - if (b >= B) return; + if (b >= B) + return; // locate // (B / R1) x D1 is broadcast as B/R1 x R1 x D1 @@ -46,42 +49,47 @@ __global__ void kernel_quaternion_mul( outputs += b * 4; scalar_t aw, 
ax, ay, az, bw, bx, by, bz = 0; - if (D1 == 3) { + if (D1 == 3) + { aw = 0, ax = inputs_1[0], ay = inputs_1[1], az = inputs_1[2]; - } else { + } + else + { aw = inputs_1[0], ax = inputs_1[1], ay = inputs_1[2], az = inputs_1[3]; } - - if (D2 == 3) { + + if (D2 == 3) + { bw = 0, bx = inputs_2[0], by = inputs_2[1], bz = inputs_2[2]; - } else { + } + else + { bw = inputs_2[0], bx = inputs_2[1], by = inputs_2[2], bz = inputs_2[3]; } outputs[0] = aw * bw - ax * bx - ay * by - az * bz; - outputs[1] = aw * bx + ax * bw + ay * bz - az * by; - outputs[2] = aw * by - ax * bz + ay * bw + az * bx; - outputs[3] = aw * bz + ax * by - ay * bx + az * bw; - + outputs[1] = aw * bx + ax * bw + ay * bz - az * by; + outputs[2] = aw * by - ax * bz + ay * bw + az * bx; + outputs[3] = aw * bz + ax * by - ay * bx + az * bw; } - template __global__ void kernel_quaternion_mul_backward( - const scalar_t * __restrict__ grad, - uint32_t B, + const scalar_t *__restrict__ grad, + uint32_t B, uint32_t R1, uint32_t R2, uint32_t D1, uint32_t D2, - const scalar_t * __restrict__ inputs_1, - const scalar_t * __restrict__ inputs_2, - scalar_t * grad_inputs_1, - scalar_t * grad_inputs_2 -) { + const scalar_t *__restrict__ inputs_1, + const scalar_t *__restrict__ inputs_2, + scalar_t *grad_inputs_1, + scalar_t *grad_inputs_2) +{ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x; const uint32_t b = t / 4; - if (b >= B) return; + if (b >= B) + return; const uint32_t d = t - b * 4; // locate @@ -90,36 +98,48 @@ __global__ void kernel_quaternion_mul_backward( inputs_2 += (b / R2) * D2; scalar_t aw, ax, ay, az, bw, bx, by, bz = 0; - if (D1 == 3) { + if (D1 == 3) + { aw = 0, ax = inputs_1[0], ay = inputs_1[1], az = inputs_1[2]; - } else { + } + else + { aw = inputs_1[0], ax = inputs_1[1], ay = inputs_1[2], az = inputs_1[3]; } - - if (D2 == 3) { + + if (D2 == 3) + { bw = 0, bx = inputs_2[0], by = inputs_2[1], bz = inputs_2[2]; - } else { + } + else + { bw = inputs_2[0], bx = inputs_2[1], by = inputs_2[2], 
bz = inputs_2[3]; } grad_inputs_1 += D1 * (b / R1) + d + D1 - 4; grad_inputs_2 += D2 * (b / R2) + d + D2 - 4; - if (d==0) { - if (D1 > 3){ + if (d == 0) + { + if (D1 > 3) + { grad_inputs_1[0] = grad[0] * bw + grad[1] * bx + grad[2] * by + grad[3] * bz; } - if (D2 > 3){ + if (D2 > 3) + { grad_inputs_2[0] = grad[0] * aw + grad[1] * ax + grad[2] * ay + grad[3] * az; - } - } else if (d==1) + } + } + else if (d == 1) { grad_inputs_1[0] = grad[0] * (-bx) + grad[1] * bw + grad[2] * (-bz) + grad[3] * by; grad_inputs_2[0] = grad[0] * (-ax) + grad[1] * aw + grad[2] * az + grad[3] * (-ay); - } else if (d==2) + } + else if (d == 2) { grad_inputs_1[0] = grad[0] * (-by) + grad[1] * bz + grad[2] * bw + grad[3] * (-bx); grad_inputs_2[0] = grad[0] * (-ay) + grad[1] * (-az) + grad[2] * aw + grad[3] * ax; - } else + } + else { grad_inputs_1[0] = grad[0] * (-bz) + grad[1] * (-by) + grad[2] * bx + grad[3] * bw; grad_inputs_2[0] = grad[0] * (-az) + grad[1] * ay + grad[2] * (-ax) + grad[3] * aw; @@ -136,83 +156,97 @@ __global__ void kernel_quaternion_mul_backward( // { // grad_inputs_1[t] = grad[0] * (-by) + grad[1] * bz + grad[2] * bw + grad[3] * (-bx); // grad_inputs_2[t] = grad[0] * (-ay) + grad[1] * (-az) + grad[2] * aw + grad[3] * ax; - // } else + // } else // { // grad_inputs_1[t] = grad[0] * (-bz) + grad[1] * (-by) + grad[2] * bx + grad[3] * bw; // grad_inputs_2[t] = grad[0] * (-az) + grad[1] * ay + grad[2] * (-ax) + grad[3] * aw; // } - } - template __global__ void kernel_quaternion_mul_backward_backward( - const scalar_t * __restrict__ grad_out_1, - const scalar_t * __restrict__ grad_out_2, - uint32_t B, + const scalar_t *__restrict__ grad_out_1, + const scalar_t *__restrict__ grad_out_2, + uint32_t B, uint32_t R1, uint32_t R2, uint32_t D1, uint32_t D2, - const scalar_t * __restrict__ grad, - const scalar_t * __restrict__ inputs_1, - const scalar_t * __restrict__ inputs_2, - scalar_t * grad_grad, - scalar_t * grad_grad_inputs_1, - scalar_t * grad_grad_inputs_2 -) { + const 
scalar_t *__restrict__ grad, + const scalar_t *__restrict__ inputs_1, + const scalar_t *__restrict__ inputs_2, + scalar_t *grad_grad, + scalar_t *grad_grad_inputs_1, + scalar_t *grad_grad_inputs_2) +{ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x; const uint32_t b = t / 4; - if (b >= B) return; + if (b >= B) + return; const uint32_t d = t - b * 4; // locate grad += b * 4; - inputs_1 += (b/R1)*D1; - inputs_2 += (b/R2)*D2; - grad_out_1 += (b/R1) * D1; - grad_out_2 += (b/R2) * D2; - + inputs_1 += (b / R1) * D1; + inputs_2 += (b / R2) * D2; + grad_out_1 += (b / R1) * D1; + grad_out_2 += (b / R2) * D2; + scalar_t aw, ax, ay, az, bw, bx, by, bz = 0; scalar_t d_aw, d_ax, d_ay, d_az, d_bw, d_bx, d_by, d_bz = 0; - if (D1 == 3) { + if (D1 == 3) + { aw = 0, ax = inputs_1[0], ay = inputs_1[1], az = inputs_1[2]; d_aw = 0, d_ax = grad_out_1[0], d_ay = grad_out_1[1], d_az = grad_out_1[2]; - } else { + } + else + { aw = inputs_1[0], ax = inputs_1[1], ay = inputs_1[2], az = inputs_1[3]; d_aw = grad_out_1[0], d_ax = grad_out_1[1], d_ay = grad_out_1[2], d_az = grad_out_1[3]; } - - if (D2 == 3) { + + if (D2 == 3) + { bw = 0, bx = inputs_2[0], by = inputs_2[1], bz = inputs_2[2]; d_bw = 0, d_bx = grad_out_2[0], d_by = grad_out_2[1], d_bz = grad_out_2[2]; - } else { + } + else + { bw = inputs_2[0], bx = inputs_2[1], by = inputs_2[2], bz = inputs_2[3]; d_bw = grad_out_2[0], d_bx = grad_out_2[1], d_by = grad_out_2[2], d_bz = grad_out_2[3]; } - grad_grad_inputs_1 += D1 * (b/R1) + d + D1 - 4; - grad_grad_inputs_2 += D2 * (b/R2) + d + D2 - 4; + grad_grad_inputs_1 += D1 * (b / R1) + d + D1 - 4; + grad_grad_inputs_2 += D2 * (b / R2) + d + D2 - 4; - if (d==0) { - if (D1 > 3){ + if (d == 0) + { + if (D1 > 3) + { grad_grad_inputs_1[0] = d_bw * grad[0] + d_bx * grad[1] + d_by * grad[2] + d_bz * grad[3]; } - if (D2 > 3){ + if (D2 > 3) + { grad_grad_inputs_2[0] = d_aw * grad[0] + d_ax * grad[1] + d_ay * grad[2] + d_az * grad[3]; } grad_grad[t] = d_aw * bw + d_bw * aw - d_ax * bx - d_bx * 
ax - d_ay * by - d_by * ay - d_az * bz - d_bz * az; - } else if (d==1){ + } + else if (d == 1) + { grad_grad_inputs_1[0] = d_bw * grad[1] - d_bx * grad[0] + d_by * grad[3] - d_bz * grad[2]; grad_grad_inputs_2[0] = d_aw * grad[1] - d_ax * grad[0] - d_ay * grad[3] + d_az * grad[2]; grad_grad[t] = d_aw * bx + d_bw * ax + d_ax * bw + d_bx * aw + d_ay * bz - d_by * az - d_az * by + d_bz * ay; - } else if (d==2){ + } + else if (d == 2) + { grad_grad_inputs_1[0] = d_bw * grad[2] - d_bx * grad[3] - d_by * grad[0] + d_bz * grad[1]; grad_grad_inputs_2[0] = d_aw * grad[2] + d_ax * grad[3] - d_ay * grad[0] - d_az * grad[1]; grad_grad[t] = d_aw * by + d_bw * ay - d_ax * bz + d_bx * az + d_ay * bw + d_by * aw + d_az * bx - d_bz * ax; - } else { + } + else + { grad_grad_inputs_1[0] = d_bw * grad[3] + d_bx * grad[2] - d_by * grad[1] - d_bz * grad[0]; grad_grad_inputs_2[0] = d_aw * grad[3] - d_ax * grad[2] + d_ay * grad[1] - d_az * grad[0]; @@ -220,50 +254,53 @@ __global__ void kernel_quaternion_mul_backward_backward( } } - - template __global__ void kernel_quaternion_conjugate( - const scalar_t * __restrict__ inputs, - uint32_t B, - scalar_t * outputs -) { + const scalar_t *__restrict__ inputs, + uint32_t B, + scalar_t *outputs) +{ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x; const uint32_t b = t / 4; - if (b >= B) return; + if (b >= B) + return; uint32_t d = t - b * 4; - if (d == 0) { + if (d == 0) + { outputs[t] = inputs[t]; - } else { + } + else + { outputs[t] = -inputs[t]; - } + } } - // inputs: [B, D], float, in [0, 1] // outputs: [B, L * C], float template -void quaternion_mul_forward_cuda(const scalar_t *inputs_1, const scalar_t *inputs_2, scalar_t *outputs, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2) { +void quaternion_mul_forward_cuda(const scalar_t *inputs_1, const scalar_t *inputs_2, scalar_t *outputs, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2) +{ static 
constexpr uint32_t N_THREADS = 256; kernel_quaternion_mul<<>>(inputs_1, inputs_2, outputs, B, R1, R2, D1, D2); } - template -void quaternion_mul_backward_cuda(const scalar_t *grad, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, const scalar_t *inputs_1, const scalar_t *inputs_2, scalar_t *grad_inputs_1, scalar_t *grad_inputs_2) { +void quaternion_mul_backward_cuda(const scalar_t *grad, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, const scalar_t *inputs_1, const scalar_t *inputs_2, scalar_t *grad_inputs_1, scalar_t *grad_inputs_2) +{ static constexpr uint32_t N_THREADS = 256; kernel_quaternion_mul_backward<<>>(grad, B, R1, R2, D1, D2, inputs_1, inputs_2, grad_inputs_1, grad_inputs_2); } template void quaternion_mul_backward_backward_cuda( - const scalar_t *grad_out_1, const scalar_t *grad_out_2, - const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, - const scalar_t *grad, const scalar_t *inputs_1, const scalar_t *inputs_2, - scalar_t *grad_grad, scalar_t *grad_grad_inputs_1, scalar_t *grad_grad_inputs_2) { + const scalar_t *grad_out_1, const scalar_t *grad_out_2, + const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, + const scalar_t *grad, const scalar_t *inputs_1, const scalar_t *inputs_2, + scalar_t *grad_grad, scalar_t *grad_grad_inputs_1, scalar_t *grad_grad_inputs_2) +{ static constexpr uint32_t N_THREADS = 256; kernel_quaternion_mul_backward_backward<<>>( - grad_out_1, grad_out_2, B, R1, R2, D1, D2, + grad_out_1, grad_out_2, B, R1, R2, D1, D2, grad, inputs_1, inputs_2, grad_grad, grad_grad_inputs_1, grad_grad_inputs_2); } @@ -272,136 +309,123 @@ template void quaternion_conjugate_cuda( const scalar_t *inputs, const uint32_t B, - scalar_t *outputs -) { + scalar_t *outputs) +{ static constexpr uint32_t N_THREADS = 256; - kernel_quaternion_conjugate<<>>(inputs, B, outputs); + 
kernel_quaternion_conjugate<<>>(inputs, B, outputs); } - - - - - - - -void quaternion_mul_forward(at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor outputs, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2) { - CHECK_CUDA(inputs_1); +void quaternion_mul_forward(at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor outputs, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2) +{ + CHECK_CUDA(inputs_1); CHECK_CUDA(inputs_2); - CHECK_CUDA(outputs); + CHECK_CUDA(outputs); - - CHECK_CONTIGUOUS(inputs_1); + CHECK_CONTIGUOUS(inputs_1); CHECK_CONTIGUOUS(inputs_2); - CHECK_CONTIGUOUS(outputs); + CHECK_CONTIGUOUS(outputs); - - CHECK_IS_FLOATING(inputs_1); + CHECK_IS_FLOATING(inputs_1); CHECK_IS_FLOATING(inputs_2); - CHECK_IS_FLOATING(outputs); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - inputs_1.scalar_type(), "quaternion_mul_forward_cuda", ([&] { - quaternion_mul_forward_cuda( - inputs_1.data_ptr(), - inputs_2.data_ptr(), - outputs.data_ptr(), - B, R1, R2, D1, D2); - })); + CHECK_IS_FLOATING(outputs); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + inputs_1.scalar_type(), "quaternion_mul_forward_cuda", ([&] + { quaternion_mul_forward_cuda( + inputs_1.data_ptr(), + inputs_2.data_ptr(), + outputs.data_ptr(), + B, R1, R2, D1, D2); })); } void quaternion_mul_backward_backward( - at::Tensor grad_out_1, at::Tensor grad_out_2, - const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, + at::Tensor grad_out_1, at::Tensor grad_out_2, + const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, at::Tensor grad, at::Tensor inputs_1, at::Tensor inputs_2, - at::Tensor grad_grad, at::Tensor grad_grad_inputs_1, at::Tensor grad_grad_inputs_2) { + at::Tensor grad_grad, at::Tensor grad_grad_inputs_1, at::Tensor grad_grad_inputs_2) +{ - CHECK_CUDA(grad_out_1); - CHECK_CUDA(grad_out_2); + CHECK_CUDA(grad_out_1); + CHECK_CUDA(grad_out_2); 
CHECK_CUDA(grad); CHECK_CUDA(inputs_1); CHECK_CUDA(inputs_2); CHECK_CUDA(grad_grad); - CHECK_CUDA(grad_grad_inputs_1); + CHECK_CUDA(grad_grad_inputs_1); CHECK_CUDA(grad_grad_inputs_2); - CHECK_CONTIGUOUS(grad_out_1); - CHECK_CONTIGUOUS(grad_out_2); + CHECK_CONTIGUOUS(grad_out_1); + CHECK_CONTIGUOUS(grad_out_2); CHECK_CONTIGUOUS(grad); CHECK_CONTIGUOUS(inputs_1); CHECK_CONTIGUOUS(inputs_2); CHECK_CONTIGUOUS(grad_grad); - CHECK_CONTIGUOUS(grad_grad_inputs_1); + CHECK_CONTIGUOUS(grad_grad_inputs_1); CHECK_CONTIGUOUS(grad_grad_inputs_2); - CHECK_IS_FLOATING(grad_out_1); - CHECK_IS_FLOATING(grad_out_2); + CHECK_IS_FLOATING(grad_out_1); + CHECK_IS_FLOATING(grad_out_2); CHECK_IS_FLOATING(grad); CHECK_IS_FLOATING(inputs_1); CHECK_IS_FLOATING(inputs_2); CHECK_IS_FLOATING(grad_grad); - CHECK_IS_FLOATING(grad_grad_inputs_1); + CHECK_IS_FLOATING(grad_grad_inputs_1); CHECK_IS_FLOATING(grad_grad_inputs_2); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "quaternion_mul_backward_backward_cuda", ([&] { - quaternion_mul_backward_backward_cuda( - grad_out_1.data_ptr(), - grad_out_2.data_ptr(), - B, R1, R2, D1, D2, - grad.data_ptr(), - inputs_1.data_ptr(), - inputs_2.data_ptr(), - grad_grad.data_ptr(), - grad_grad_inputs_1.data_ptr(), - grad_grad_inputs_2.data_ptr()); - })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "quaternion_mul_backward_backward_cuda", ([&] + { quaternion_mul_backward_backward_cuda( + grad_out_1.data_ptr(), + grad_out_2.data_ptr(), + B, R1, R2, D1, D2, + grad.data_ptr(), + inputs_1.data_ptr(), + inputs_2.data_ptr(), + grad_grad.data_ptr(), + grad_grad_inputs_1.data_ptr(), + grad_grad_inputs_2.data_ptr()); })); } - - -void quaternion_mul_backward(at::Tensor grad, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor grad_inputs_1, at::Tensor grad_inputs_2) { - CHECK_CUDA(grad); - CHECK_CUDA(inputs_1); +void quaternion_mul_backward(at::Tensor 
grad, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor grad_inputs_1, at::Tensor grad_inputs_2) +{ + CHECK_CUDA(grad); + CHECK_CUDA(inputs_1); CHECK_CUDA(inputs_2); - CHECK_CUDA(grad_inputs_1); + CHECK_CUDA(grad_inputs_1); CHECK_CUDA(grad_inputs_2); - CHECK_CONTIGUOUS(grad); - CHECK_CONTIGUOUS(inputs_1); + CHECK_CONTIGUOUS(grad); + CHECK_CONTIGUOUS(inputs_1); CHECK_CONTIGUOUS(inputs_2); - CHECK_CONTIGUOUS(grad_inputs_1); + CHECK_CONTIGUOUS(grad_inputs_1); CHECK_CONTIGUOUS(grad_inputs_2); - CHECK_IS_FLOATING(grad); - CHECK_IS_FLOATING(inputs_1); + CHECK_IS_FLOATING(grad); + CHECK_IS_FLOATING(inputs_1); CHECK_IS_FLOATING(inputs_2); - CHECK_IS_FLOATING(grad_inputs_1); + CHECK_IS_FLOATING(grad_inputs_1); CHECK_IS_FLOATING(grad_inputs_2); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "quaternion_mul_backward_cuda", ([&] { - quaternion_mul_backward_cuda( - grad.data_ptr(), - B, R1, R2, D1, D2, - inputs_1.data_ptr(), - inputs_2.data_ptr(), - grad_inputs_1.data_ptr(), - grad_inputs_2.data_ptr()); - })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "quaternion_mul_backward_cuda", ([&] + { quaternion_mul_backward_cuda( + grad.data_ptr(), + B, R1, R2, D1, D2, + inputs_1.data_ptr(), + inputs_2.data_ptr(), + grad_inputs_1.data_ptr(), + grad_inputs_2.data_ptr()); })); } - -void quaternion_conjugate(at::Tensor inputs, const uint32_t B, at::Tensor outputs) { +void quaternion_conjugate(at::Tensor inputs, const uint32_t B, at::Tensor outputs) +{ CHECK_IS_CONTIGUOUS_FLOAT_CUDA(inputs); CHECK_IS_CONTIGUOUS_FLOAT_CUDA(outputs); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - inputs.scalar_type(), "quaternion_conjugate_cuda", ([&] { - quaternion_conjugate_cuda( - inputs.data_ptr(), - B, - outputs.data_ptr()); - })); - + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + inputs.scalar_type(), "quaternion_conjugate_cuda", ([&] + { quaternion_conjugate_cuda( + inputs.data_ptr(), + 
B, + outputs.data_ptr()); })); } \ No newline at end of file diff --git a/dqtorch/src/quaternion.h b/dqtorch/src/quaternion.h index f144029..1d7de9c 100644 --- a/dqtorch/src/quaternion.h +++ b/dqtorch/src/quaternion.h @@ -1,20 +1,16 @@ -# pragma once +#pragma once #include #include - void quaternion_mul_forward(at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor outputs, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2); - -void quaternion_mul_backward(at::Tensor grad, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor grad_inputs_1, at::Tensor grad_inputs_2); - +void quaternion_mul_backward(at::Tensor grad, const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor grad_inputs_1, at::Tensor grad_inputs_2); void quaternion_mul_backward_backward( - at::Tensor grad_out_1, at::Tensor grad_out_2, - const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, - at::Tensor grad, at::Tensor inputs_1, at::Tensor inputs_2, + at::Tensor grad_out_1, at::Tensor grad_out_2, + const uint32_t B, const uint32_t R1, const uint32_t R2, const uint32_t D1, const uint32_t D2, + at::Tensor grad, at::Tensor inputs_1, at::Tensor inputs_2, at::Tensor grad_grad, at::Tensor grad_grad_inputs_1, at::Tensor grad_grad_inputs_2); - void quaternion_conjugate(at::Tensor inputs, const uint32_t B, at::Tensor outputs); \ No newline at end of file From 0a4c107dc9890eff1982969f05ac8a21dd539d6e Mon Sep 17 00:00:00 2001 From: Marisha Date: Fri, 15 Nov 2024 17:53:30 -0600 Subject: [PATCH 3/7] Automatically set cuda arch list to supported gpu to prevent warn --- dqtorch/backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dqtorch/backend.py b/dqtorch/backend.py index 8a6bc69..e49fdd4 100644 --- a/dqtorch/backend.py +++ 
b/dqtorch/backend.py @@ -1,6 +1,10 @@ import os +import torch from torch.utils.cpp_extension import load +major, minor = torch.cuda.get_device_capability() +os.environ["TORCH_CUDA_ARCH_LIST"] = f"{major}.{minor}" + _src_path = os.path.dirname(os.path.abspath(__file__)) nvcc_flags = [ From db7a65fb6cbd71745c2da91edcf4e890177c91e3 Mon Sep 17 00:00:00 2001 From: Marisha Date: Fri, 15 Nov 2024 17:58:15 -0600 Subject: [PATCH 4/7] Added requirements.txt for building and running --- requirements.txt | Bin 0 -> 598 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6da62f27797656cf83c3b55b6717cbe1d0c2c563 GIT binary patch literal 598 zcma)(-Acny5QOJi@JWP{#8&ab7ZAMi0i@K#zcwvRV%nEizuA)}3SP)zPYye?v$LD8 z&q}Xat0cBs>0P&+OU;xI_qongsp0HktpYEUsn8r&L1Z%L4xL(WT9TVIW~dmwC%;F% zL*;>Yg_c>a&xu8_HlQ2ORxg1Xv6s}LrlwVoPb(e9E=j{(>d}L>U7ZiNUy0GtVW;D$ zU+5~j7~RD#yor9mGq7pKCLDN%7S5KPHNm^ap#I7n7r1r9+~h!=aOBX>eGz=+c!k|7d`Gv M&U#dz#}o Date: Fri, 15 Nov 2024 23:54:48 -0600 Subject: [PATCH 5/7] Made requirements.txt more flexible --- requirements.txt | Bin 598 -> 598 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6da62f27797656cf83c3b55b6717cbe1d0c2c563..f8ce2ae4e23f7a80abb41ae25e61087a88877a90 100644 GIT binary patch delta 180 zcmcb{a*f6K|GzYbOokkWR3Of0NM^`puw$@gFlNwWFocr43|tIp48=fL092LCU^mfN znbB@yxGIpSl?4*B6@kQFX&`YE#QH7IXg67!QC%KnkU4_^&?pE78C1+r$&d>)sFJ~M jaw6kV1CSaMpbAT{Dv*kDh76!JAm@POKsrqrOyRNs4s9aU delta 180 zcmcb{a*f6K|GzYbOokkWR3Of0NM^`pumwV620aEtD9Ovf#gN8O41@(hRmlvt6OEM_ zZ6}7S0*P8#ATe7JNbHpc5;sAt-|~#MlcgEe Date: Fri, 29 Nov 2024 22:14:49 -0600 Subject: [PATCH 6/7] Removed unused imports and added .venv to gitignore --- .gitignore | 1 + dqtorch/dqtorch.py | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 007a939..fe7e7d7 
100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ # Ignore environment files .env +.venv venv/ env/ pip-lockfile.txt diff --git a/dqtorch/dqtorch.py b/dqtorch/dqtorch.py index 5339e79..a69b7d2 100644 --- a/dqtorch/dqtorch.py +++ b/dqtorch/dqtorch.py @@ -1,12 +1,9 @@ import torch -from typing import Tuple, Optional, Union +from typing import Tuple from .quaternion_cuda import quaternion_mul as _quaternion_mul_cuda from .quaternion_cuda import quaternion_conjugate as _quaternion_conjugate_cuda -from enum import Enum, unique - - Quaternion = torch.Tensor DualQuaternions = Tuple[Quaternion, Quaternion] QuaternionTranslation = Tuple[Quaternion, torch.Tensor] From 033c2308ff7b251e4d55522e424c66a654a7daae Mon Sep 17 00:00:00 2001 From: Marisha Date: Fri, 29 Nov 2024 23:44:07 -0600 Subject: [PATCH 7/7] =?UTF-8?q?Updated=20install=20method=20to=20be=20in?= =?UTF-8?q?=20line=20with=20modern=20practice,=20then=20shook=20things=20a?= =?UTF-8?q?round=20until=20it=20build=20correctly=20=F0=9F=91=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 ++- pyproject.toml | 16 +++++++ requirements.txt | Bin 598 -> 356 bytes setup.py | 109 +++++++++++++++++++++++++---------------------- 4 files changed, 79 insertions(+), 51 deletions(-) create mode 100644 pyproject.toml diff --git a/README.md b/README.md index fc4dd76..c3715db 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,11 @@ tested in Pytorch 2.5.1+cu124, CUDA-12.4, gcc-9.2.0 ### Install ``` -python -m pip install . +pip install -r requirements.txt +pip install . --no-build-isolation ``` +And make sure that there's no lock file left over in torch_extensions/Cache/py312_cu124/_quaternion_cuda before running a program. 
+Signed, several hours of my life ### Test ``` python examples.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a84086b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,16 @@ +[build-system] +requires = ["setuptools>=75.6.0", "wheel>=0.45.1", "ninja>=1.11.1.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "dqtorch" +version = "0.1.0" +description = "A faster pytorch library for (dual) quaternion batched operations." +authors = [{ name = "Chaoyang Wang" }, { name = "Marisha Norcross" }] +license = { text = "MIT" } +requires-python = ">=3.12" +dependencies = ["torch==2.5.1+cu124", "ninja>=1.11.1.2"] + +[project.urls] +Homepage = "https://github.com/Wirlocke/dqtorch" +Documentation = "https://github.com/Wirlocke/dqtorch#readme" diff --git a/requirements.txt b/requirements.txt index f8ce2ae4e23f7a80abb41ae25e61087a88877a90..48ca0737e72885b3ed6f509fac08ca7e6b8eddf3 100644 GIT binary patch literal 356 zcmZXQ%?^Sv5QOJ!;yZXZtridy6MPo}jX$YCqy~9;b#~!kG)-x{`_1kw@0Wpr#}g5E zl)PWipw@`NgRy`W&df%1=yAY?t451~T;N&qWtkgEwB#$&R&OVg17Cq>#aa#tGAyz9 zA`29ZTI{$T)<0ZNACbyV6)h|ERCCH8YFOyT+lp9Gyn=e|PygO{+aH$t@n!1Xfh$ zy5OzjYm+rMGjGlNGv4ZdSA4*ZUJ<+|<*@UwJ||Be^r9yc9_d^%%dtLXw7-2bCG2ru OvUZ-|beH_YG5nul 2>nul") != 0: - cl_path = find_cl_path() - if cl_path is None: - raise RuntimeError( - "Could not locate a supported Microsoft Visual C++ installation") - os.environ["PATH"] += ";" + cl_path - -LIB_NAME = 'dqtorch' setup( - name=LIB_NAME, # package name, import this to use python API - description='A faster pytorch library for (dual) quaternion batched operations.', - license="MIT", - author="Chaoyang Wang", - python_requires=">=3.6", - setup_requires=["torch>=1.12"], - install_requires=["torch>=1.12"], - packages=[LIB_NAME], - ext_modules=[ - CUDAExtension( - name='_quaternion_cuda', # extension name, import this to use CUDA API - sources=[os.path.join(_src_path, 'src', f) for f in [ - 'quaternion.cu', - 'bindings.cpp', - ]], - 
extra_compile_args={ - 'cxx': c_flags, - 'nvcc': nvcc_flags, - } - ), - ], + name='dqtorch', + version='0.1.0', + packages=find_packages(), + ext_modules=ext_modules, cmdclass={ 'build_ext': BuildExtension, - } + }, + package_data={ + 'dqtorch': ['src/*.cpp', 'src/*.cu', 'src/*.h'], + }, + zip_safe=False, )