Merged PR 5333: Fix truncation of varchar/binary at full capacity + Add related tests

Theekshna Kotian · Theekshna Kotian · commit 7056dcad50d5 · 2025-02-26T11:52:03.000Z
Currently, if you insert a string of length 10 into a VARCHAR(10) column and then fetch it, you get back a truncated string of length 9. This PR fixes that case and adds related tests.

----
#### AI description  (iteration 1)
#### PR Classification
Bug fix

#### PR Summary
This pull request addresses the truncation issue of `VARCHAR` and `VARBINARY` types at full capacity and adds related tests to ensure proper functionality.
- Added tests in `/tests/test_004_cursor.py` to verify `VARCHAR`, `NVARCHAR`, and `VARBINARY` columns can handle values at their full capacity.
- Modified `/mssql_python/pybind/ddbc_bindings.cpp` to correctly handle buffer sizes and null-termination for `VARCHAR`, `NVARCHAR`, and `VARBINARY` types during data fetching and binding.
<!-- GitOpsUserAgent=GitOps.Apps.Server.pullrequestcopilot -->

Related work items: #33942
diff --git a/mssql_python/cursor.py b/mssql_python/cursor.py
@@ -148,68 +148,68 @@ def _parse_time(self, param):
                 continue
         return None
     
-    def _parse_timestamptz(self, param):
-        """
-        Attempt to parse a string as a timestamp with time zone (timestamptz).
-        
-        Args:
-            param: The string to parse.
-        
-        Returns:
-            A datetime.datetime object if parsing is successful, else None.
-        """
-        formats = [
-            "%Y-%m-%dT%H:%M:%S%z",      # ISO 8601 datetime with timezone offset
-            "%Y-%m-%d %H:%M:%S.%f%z",   # Datetime with fractional seconds and timezone offset
-        ]
-        for fmt in formats:
-            try:
-                return datetime.datetime.strptime(param, fmt)
-            except ValueError:
-                continue
-        return None
-
-    def _parse_smalldatetime(self, param):
-        """
-        Attempt to parse a string as a smalldatetime.
-        
-        Args:
-            param: The string to parse.
-        
-        Returns:
-            A datetime.datetime object if parsing is successful, else None.
-        """
-        formats = [
-            "%Y-%m-%d %H:%M:%S",        # Standard datetime
-        ]
-        for fmt in formats:
-            try:
-                return datetime.datetime.strptime(param, fmt)
-            except ValueError:
-                continue
-        return None
-
-    def _parse_datetime2(self, param):
-        """
-        Attempt to parse a string as a datetime2.
-        
-        Args:
-            param: The string to parse.
-        
-        Returns:
-            A datetime.datetime object if parsing is successful, else None.
-        """
-        formats = [
-            "%Y-%m-%d %H:%M:%S.%f",     # Datetime with fractional seconds (up to 6 digits)
-        ]
-        for fmt in formats:
-            try:
-                dt = datetime.datetime.strptime(param, fmt)
-                if fmt == "%Y-%m-%d %H:%M:%S.%f" and len(param.split('.')[-1]) > 3:
-                    return dt
-            except ValueError:
-                continue
-        return None
+    # def _parse_timestamptz(self, param):
+    #     """
+    #     Attempt to parse a string as a timestamp with time zone (timestamptz).
+    #     
+    #     Args:
+    #         param: The string to parse.
+    #     
+    #     Returns:
+    #         A datetime.datetime object if parsing is successful, else None.
+    #     """
+    #     formats = [
+    #         "%Y-%m-%dT%H:%M:%S%z",      # ISO 8601 datetime with timezone offset
+    #         "%Y-%m-%d %H:%M:%S.%f%z",   # Datetime with fractional seconds and timezone offset
+    #     ]
+    #     for fmt in formats:
+    #         try:
+    #             return datetime.datetime.strptime(param, fmt)
+    #         except ValueError:
+    #             continue
+    #     return None
+
+    # def _parse_smalldatetime(self, param):
+    #     """
+    #     Attempt to parse a string as a smalldatetime.
+    #     
+    #     Args:
+    #         param: The string to parse.
+    #     
+    #     Returns:
+    #         A datetime.datetime object if parsing is successful, else None.
+    #     """
+    #     formats = [
+    #         "%Y-%m-%d %H:%M:%S",        # Standard datetime
+    #     ]
+    #     for fmt in formats:
+    #         try:
+    #             return datetime.datetime.strptime(param, fmt)
+    #         except ValueError:
+    #             continue
+    #     return None
+
+    # def _parse_datetime2(self, param):
+    #     """
+    #     Attempt to parse a string as a datetime2.
+    #     
+    #     Args:
+    #         param: The string to parse.
+    #     
+    #     Returns:
+    #         A datetime.datetime object if parsing is successful, else None.
+    #     """
+    #     formats = [
+    #         "%Y-%m-%d %H:%M:%S.%f",     # Datetime with fractional seconds (up to 6 digits)
+    #     ]
+    #     for fmt in formats:
+    #         try:
+    #             dt = datetime.datetime.strptime(param, fmt)
+    #             if fmt == "%Y-%m-%d %H:%M:%S.%f" and len(param.split('.')[-1]) > 3:
+    #                 return dt
+    #         except ValueError:
+    #             continue
+    #     return None
 
     def _get_numeric_data(self, param):
         """
@@ -326,6 +326,7 @@ def _map_sql_type(self, param, parameters_list, i):
 
             # String mapping logic here
             is_unicode = self._is_unicode_string(param)
+            # TODO: revisit
             if len(param) > 4000:  # Long strings
                 if is_unicode:
                     return odbc_sql_const.SQL_WLONGVARCHAR.value, odbc_sql_const.SQL_C_WCHAR.value, len(param), 0
@@ -348,8 +349,8 @@ def _map_sql_type(self, param, parameters_list, i):
             else:
                 return odbc_sql_const.SQL_BINARY.value, odbc_sql_const.SQL_C_BINARY.value, len(param), 0
         
-        elif isinstance(param, uuid.UUID):  # Handle uniqueidentifier
-            return odbc_sql_const.SQL_GUID.value, odbc_sql_const.SQL_C_GUID.value, 36, 0
+        # elif isinstance(param, uuid.UUID):  # Handle uniqueidentifier
+        #     return odbc_sql_const.SQL_GUID.value, odbc_sql_const.SQL_C_GUID.value, 36, 0
         
         elif isinstance(param, datetime.datetime):
             # Always keep datetime.datetime check before datetime.date check since datetime.datetime is a subclass of datetime (isinstance(datetime.datetime, datetime.date) returns True)
diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp
@@ -420,6 +420,8 @@ SQLRETURN BindParameters(SQLHANDLE hStmt, const py::list& params,
                 if (!py::isinstance<py::none>(param)) {
                     ThrowStdException(MakeParamMismatchErrorStr(paramInfo.paramCType, paramIndex));
                 }
+                // TODO: This won't work for None values added to BINARY/VARBINARY columns. None values
+                //       of binary columns need to have C type = SQL_C_BINARY and SQL type = SQL_BINARY
                 dataPtr = nullptr;
                 strLenOrIndPtr = AllocateParamBuffer<SQLLEN>(paramBuffers);
                 *strLenOrIndPtr = SQL_NULL_DATA;
@@ -918,19 +920,21 @@ SQLRETURN SQLGetData_wrap(intptr_t StatementHandle, SQLUSMALLINT colCount, py::l
             case SQL_LONGVARCHAR: {
                 // TODO: revisit
                 HandleZeroColumnSizeAtFetch(columnSize);
-                std::vector<SQLCHAR> dataBuffer(columnSize + 1);
+		uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
+                std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
                 SQLLEN dataLen;
                 // TODO: Handle the return code better
-                ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size() - 1,
+                ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
                                      &dataLen);
 
                 if (SQL_SUCCEEDED(ret)) {
                     // TODO: Refactor these if's across other switches to avoid code duplication
                     // columnSize is in chars, dataLen is in bytes
                     if (dataLen > 0) {
-                        int numCharsInData = dataLen / sizeof(SQLCHAR);
+                        uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
+                        // NOTE: dataBuffer.size() includes null-terminator, dataLen doesn't. Hence use '<'.
 						if (numCharsInData < dataBuffer.size()) {
-							dataBuffer[numCharsInData] = '\0';  // Null-terminate
+                            // SQLGetData will null-terminate the data
                             row.append(std::string(reinterpret_cast<char*>(dataBuffer.data())));
 						} else {
                             // In this case, buffer size is smaller, and data to be retrieved is longer
@@ -962,17 +966,18 @@ SQLRETURN SQLGetData_wrap(intptr_t StatementHandle, SQLUSMALLINT colCount, py::l
 			case SQL_WLONGVARCHAR: {
                 // TODO: revisit
                 HandleZeroColumnSizeAtFetch(columnSize);
-                std::vector<SQLWCHAR> dataBuffer(columnSize + 1);
+		uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
+                std::vector<SQLWCHAR> dataBuffer(fetchBufferSize);
                 SQLLEN dataLen;
                 ret = SQLGetData_ptr(hStmt, i, SQL_C_WCHAR, dataBuffer.data(),
-                                     (dataBuffer.size() - 1) * sizeof(SQLWCHAR), &dataLen);
+                                     dataBuffer.size() * sizeof(SQLWCHAR), &dataLen);
 
                 if (SQL_SUCCEEDED(ret)) {
                     // TODO: Refactor these if's across other switches to avoid code duplication
                     if (dataLen > 0) {
-                        int numCharsInData = dataLen / sizeof(SQLWCHAR);
+                        uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
 						if (numCharsInData < dataBuffer.size()) {
-							dataBuffer[numCharsInData] = L'\0';  // Null-terminate
+                            // SQLGetData will null-terminate the data
                             row.append(std::wstring(dataBuffer.data()));
 						} else {
                             // In this case, buffer size is smaller, and data to be retrieved is longer
@@ -1273,24 +1278,37 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
         switch (dataType) {
             case SQL_CHAR:
             case SQL_VARCHAR:
-            case SQL_LONGVARCHAR:
+            case SQL_LONGVARCHAR: {
                 // TODO: handle variable length data correctly. This logic wont suffice
                 HandleZeroColumnSizeAtFetch(columnSize);
-                buffers.charBuffers[col - 1].resize(fetchSize * (columnSize + 1 /*null-terminator*/));
+                uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
+		// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as 2GB-1 by
+		// SQLDescribeCol. So fetchBufferSize = 2GB. fetchSize=1 if columnSize>1GB.
+		// So we'll allocate a vector of size 2GB. If a query fetches multiple (say N)
+		// LONG... columns, we will have allocated multiple (N) 2GB sized vectors. This
+		// will make driver very slow. And if the N is high enough, we could hit the OS
+		// limit for heap memory that we can allocate, & hence get a std::bad_alloc. The
+		// process could also be killed by OS for consuming too much memory.
+		// Hence this will be revisited in beta to not allocate 2GB+ memory,
+		// & use streaming instead
+                buffers.charBuffers[col - 1].resize(fetchSize * fetchBufferSize);
                 ret = SQLBindCol_ptr(hStmt, col, SQL_C_CHAR, buffers.charBuffers[col - 1].data(),
-                                     (columnSize) * sizeof(SQLCHAR),
+                                     fetchBufferSize * sizeof(SQLCHAR),
                                      buffers.indicators[col - 1].data());
                 break;
+            }
             case SQL_WCHAR:
             case SQL_WVARCHAR:
-            case SQL_WLONGVARCHAR:
+            case SQL_WLONGVARCHAR: {
                 // TODO: handle variable length data correctly. This logic wont suffice
                 HandleZeroColumnSizeAtFetch(columnSize);
-                buffers.wcharBuffers[col - 1].resize(fetchSize * (columnSize + 1 /*null-terminator*/));
+                uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
+                buffers.wcharBuffers[col - 1].resize(fetchSize * fetchBufferSize);
                 ret = SQLBindCol_ptr(hStmt, col, SQL_C_WCHAR, buffers.wcharBuffers[col - 1].data(),
-                                     (columnSize) * sizeof(SQLWCHAR),
+                                     fetchBufferSize * sizeof(SQLWCHAR),
                                      buffers.indicators[col - 1].data());
                 break;
+            }
             case SQL_INTEGER:
                 buffers.intBuffers[col - 1].resize(fetchSize);
                 ret = SQLBindCol_ptr(hStmt, col, SQL_C_SLONG, buffers.intBuffers[col - 1].data(),
@@ -1439,12 +1457,13 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
                     // TODO: variable length data needs special handling, this logic wont suffice
                     SQLULEN columnSize = columnMeta["ColumnSize"].cast<SQLULEN>();
                     HandleZeroColumnSizeAtFetch(columnSize);
-					int numCharsInData = dataLen / sizeof(SQLCHAR);
-                    if (numCharsInData <= columnSize) {
-						buffers.charBuffers[col - 1][(i * columnSize) + numCharsInData] =
-                            '\0';  // Null-terminate
+                    uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
+					uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
+					// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence '<'
+                    if (numCharsInData < fetchBufferSize) {
+                        // SQLFetch will nullterminate the data
                         row.append(std::string(
-                            reinterpret_cast<char*>(&buffers.charBuffers[col - 1][i * columnSize]),
+                            reinterpret_cast<char*>(&buffers.charBuffers[col - 1][i * fetchBufferSize]),
                             numCharsInData));
                     } else {
                         // In this case, buffer size is smaller, and data to be retrieved is longer
@@ -1463,13 +1482,13 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
                     // TODO: variable length data needs special handling, this logic wont suffice
                     SQLULEN columnSize = columnMeta["ColumnSize"].cast<SQLULEN>();
                     HandleZeroColumnSizeAtFetch(columnSize);
-					int numCharsInData = dataLen / sizeof(SQLWCHAR);
-                    if (numCharsInData <= columnSize) {
-                        buffers.wcharBuffers[col - 1]
-                                            [(i * columnSize) + numCharsInData] =
-                            L'\0';  // Null-terminate
+                    uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
+					uint64_t numCharsInData = dataLen / sizeof(SQLWCHAR);
+					// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence '<'
+                    if (numCharsInData < fetchBufferSize) {
+                        // SQLFetch will nullterminate the data
                         row.append(std::wstring(
-                            reinterpret_cast<wchar_t*>(&buffers.wcharBuffers[col - 1][i * columnSize]),
+                            reinterpret_cast<wchar_t*>(&buffers.wcharBuffers[col - 1][i * fetchBufferSize]),
                             numCharsInData));
                     } else {
                         // In this case, buffer size is smaller, and data to be retrieved is longer
diff --git a/tests/test_004_cursor.py b/tests/test_004_cursor.py