4040)
4141from pandas ._libs .lib import is_string_array
4242from pandas ._libs .tslibs import timezones
43+ from pandas .compat import HAS_PYARROW
4344from pandas .compat ._optional import import_optional_dependency
4445from pandas .compat .pickle_compat import patch_pickle
4546from pandas .errors import (
@@ -391,6 +392,13 @@ def read_hdf(
391392 DataFrame.to_hdf : Write a HDF file from a DataFrame.
392393 HDFStore : Low-level access to HDF files.
393394
395+ Notes
396+ -----
397+ If ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is
398+ ``True``, and PyArrow is installed, then data containing a UTF-16 surrogate
399+ that cannot be encoded as UTF-8 is returned with dtype
400+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
401+
394402 Examples
395403 --------
396404 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
@@ -2182,6 +2190,20 @@ def convert(
21822190 # making an Index instance could throw a number of different errors
21832191 try :
21842192 new_pd_index = factory (values , ** kwargs )
2193+ except UnicodeEncodeError as err :
2194+ if (
2195+ errors == "surrogatepass"
2196+ and get_option ("future.infer_string" )
2197+ and str (err ).endswith ("surrogates not allowed" )
2198+ and HAS_PYARROW
2199+ ):
2200+ new_pd_index = factory (
2201+ values ,
2202+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
2203+ ** kwargs ,
2204+ )
2205+ else :
2206+ raise
21852207 except ValueError :
21862208 # if the output freq is different than what we recorded,
21872209 # it should be None (see also 'doc example part 2')
@@ -3097,12 +3119,29 @@ def read_index_node(
30973119 ** kwargs ,
30983120 )
30993121 else :
3100- index = factory (
3101- _unconvert_index (
3102- data , kind , encoding = self .encoding , errors = self .errors
3103- ),
3104- ** kwargs ,
3105- )
3122+ try :
3123+ index = factory (
3124+ _unconvert_index (
3125+ data , kind , encoding = self .encoding , errors = self .errors
3126+ ),
3127+ ** kwargs ,
3128+ )
3129+ except UnicodeEncodeError as err :
3130+ if (
3131+ self .errors == "surrogatepass"
3132+ and get_option ("future.infer_string" )
3133+ and str (err ).endswith ("surrogates not allowed" )
3134+ and HAS_PYARROW
3135+ ):
3136+ index = factory (
3137+ _unconvert_index (
3138+ data , kind , encoding = self .encoding , errors = self .errors
3139+ ),
3140+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3141+ ** kwargs ,
3142+ )
3143+ else :
3144+ raise
31063145
31073146 index .name = name
31083147
@@ -3236,13 +3275,24 @@ def read(
32363275 self .validate_read (columns , where )
32373276 index = self .read_index ("index" , start = start , stop = stop )
32383277 values = self .read_array ("values" , start = start , stop = stop )
3239- result = Series (values , index = index , name = self .name , copy = False )
3240- if (
3241- using_string_dtype ()
3242- and isinstance (values , np .ndarray )
3243- and is_string_array (values , skipna = True )
3244- ):
3245- result = result .astype (StringDtype (na_value = np .nan ))
3278+ try :
3279+ result = Series (values , index = index , name = self .name , copy = False )
3280+ except UnicodeEncodeError as err :
3281+ if (
3282+ self .errors == "surrogatepass"
3283+ and get_option ("future.infer_string" )
3284+ and str (err ).endswith ("surrogates not allowed" )
3285+ and HAS_PYARROW
3286+ ):
3287+ result = Series (
3288+ values ,
3289+ index = index ,
3290+ name = self .name ,
3291+ copy = False ,
3292+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3293+ )
3294+ else :
3295+ raise
32463296 return result
32473297
32483298 def write (self , obj , ** kwargs ) -> None :
@@ -4704,7 +4754,24 @@ def read(
47044754 values = values .reshape ((1 , values .shape [0 ]))
47054755
47064756 if isinstance (values , np .ndarray ):
4707- df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4757+ try :
4758+ df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4759+ except UnicodeEncodeError as err :
4760+ if (
4761+ self .errors == "surrogatepass"
4762+ and get_option ("future.infer_string" )
4763+ and str (err ).endswith ("surrogates not allowed" )
4764+ and HAS_PYARROW
4765+ ):
4766+ df = DataFrame (
4767+ values .T ,
4768+ columns = cols_ ,
4769+ index = index_ ,
4770+ copy = False ,
4771+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
4772+ )
4773+ else :
4774+ raise
47084775 elif isinstance (values , Index ):
47094776 df = DataFrame (values , columns = cols_ , index = index_ )
47104777 else :
@@ -4714,23 +4781,10 @@ def read(
47144781 assert (df .dtypes == values .dtype ).all (), (df .dtypes , values .dtype )
47154782
47164783 # If str / string dtype is stored in meta, use that.
4717- converted = False
47184784 for column in cols_ :
47194785 dtype = getattr (self .table .attrs , f"{ column } _meta" , None )
47204786 if dtype in ["str" , "string" ]:
47214787 df [column ] = df [column ].astype (dtype )
4722- converted = True
4723- # Otherwise try inference.
4724- if (
4725- not converted
4726- and using_string_dtype ()
4727- and isinstance (values , np .ndarray )
4728- and is_string_array (
4729- values ,
4730- skipna = True ,
4731- )
4732- ):
4733- df = df .astype (StringDtype (na_value = np .nan ))
47344788 frames .append (df )
47354789
47364790 if len (frames ) == 1 :
@@ -5194,7 +5248,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
51945248 # encode if needed
51955249 if len (data ):
51965250 data = (
5197- Series (data .ravel (), copy = False )
5251+ Series (data .ravel (), copy = False , dtype = "object" )
51985252 .str .encode (encoding , errors )
51995253 ._values .reshape (data .shape )
52005254 )
@@ -5234,7 +5288,9 @@ def _unconvert_string_array(
52345288 dtype = f"U{ itemsize } "
52355289
52365290 if isinstance (data [0 ], bytes ):
5237- ser = Series (data , copy = False ).str .decode (encoding , errors = errors )
5291+ ser = Series (data , copy = False ).str .decode (
5292+ encoding , errors = errors , dtype = "object"
5293+ )
52385294 data = ser .to_numpy ()
52395295 data .flags .writeable = True
52405296 else :
0 commit comments