4444 ValueLabelTypeMismatch ,
4545)
4646from pandas .util ._decorators import (
47- Appender ,
4847 doc ,
4948 set_module ,
5049)
127126 Return StataReader object for iterations, returns chunks with
128127 given number of lines."""
129128
130- _iterator_params = """\
131- iterator : bool, default False
132- Return StataReader object."""
133-
134129_reader_notes = """\
135130 Notes
136131-----
139134file is associated to an incomplete set of value labels that only
140135label a strict subset of the values."""
141136
142- _read_stata_doc = f"""
143- Read Stata file into DataFrame.
144-
145- Parameters
146- ----------
147- filepath_or_buffer : str, path object or file-like object
148- Any valid string path is acceptable. The string could be a URL. Valid
149- URL schemes include http, ftp, s3, and file. For file URLs, a host is
150- expected. A local file could be: ``file://localhost/path/to/table.dta``.
151-
152- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
153-
154- By file-like object, we refer to objects with a ``read()`` method,
155- such as a file handle (e.g. via builtin ``open`` function)
156- or ``StringIO``.
157- { _statafile_processing_params1 }
158- { _statafile_processing_params2 }
159- { _chunksize_params }
160- { _iterator_params }
161- { _shared_docs ["decompression_options" ] % "filepath_or_buffer" }
162- { _shared_docs ["storage_options" ]}
163-
164- Returns
165- -------
166- DataFrame, pandas.api.typing.StataReader
167- If iterator or chunksize, returns StataReader, else DataFrame.
168-
169- See Also
170- --------
171- io.stata.StataReader : Low-level reader for Stata data files.
172- DataFrame.to_stata: Export Stata data files.
173-
174- { _reader_notes }
175-
176- Examples
177- --------
178-
179- Creating a dummy stata for this example
180-
181- >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
182- ... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP
183- >>> df.to_stata('animals.dta') # doctest: +SKIP
184-
185- Read a Stata dta file:
186-
187- >>> df = pd.read_stata('animals.dta') # doctest: +SKIP
188-
189- Read a Stata dta file in 10,000 line chunks:
190-
191- >>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
192- >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
193- >>> df.to_stata('filename.dta') # doctest: +SKIP
194-
195- >>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
196- >>> for chunk in itr:
197- ... # Operate on a single chunk, e.g., chunk.mean()
198- ... pass # doctest: +SKIP
199- """
200-
201- _read_method_doc = f"""\
202- Reads observations from Stata file, converting them into a dataframe
203-
204- Parameters
205- ----------
206- nrows : int
207- Number of lines to read from data file, if None read whole file.
208- { _statafile_processing_params1 }
209- { _statafile_processing_params2 }
210-
211- Returns
212- -------
213- DataFrame
214- """
215-
216137_stata_reader_doc = f"""\
217138 Class for reading Stata dta files.
218139
@@ -1677,7 +1598,6 @@ def get_chunk(self, size: int | None = None) -> DataFrame:
16771598 size = self ._chunksize
16781599 return self .read (nrows = size )
16791600
1680- @Appender (_read_method_doc )
16811601 def read (
16821602 self ,
16831603 nrows : int | None = None ,
@@ -1689,6 +1609,38 @@ def read(
16891609 columns : Sequence [str ] | None = None ,
16901610 order_categoricals : bool | None = None ,
16911611 ) -> DataFrame :
1612+ """
1613+ Read observations from a Stata file, converting them into a DataFrame.
1614+
1615+ Parameters
1616+ ----------
1617+ nrows : int, optional
1618+ Number of lines to read from data file. If None, read the whole file.
1619+ convert_dates : bool, default True
1620+ Convert date variables to DataFrame time values.
1621+ convert_categoricals : bool, default True
1622+ Read value labels and convert columns to Categorical/Factor variables.
1623+ index_col : str, optional
1624+ Column to set as index.
1625+ convert_missing : bool, default False
1626+ Flag indicating whether to convert missing values to their Stata
1627+ representations. If False, missing values are replaced with nan.
1628+ If True, columns containing missing values are returned with
1629+ object data types and missing values are represented by
1630+ StataMissingValue objects.
1631+ preserve_dtypes : bool, default True
1632+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
1633+ default types for foreign data (float64 or int64).
1634+ columns : list or None
1635+ Columns to retain. Columns will be returned in the given order. None
1636+ returns all columns.
1637+ order_categoricals : bool, default True
1638+ Flag indicating whether converted categorical data are ordered.
1639+
1640+ Returns
1641+ -------
1642+ DataFrame
1643+ """
16921644 self ._ensure_open ()
16931645
16941646 # Handle options
@@ -2135,7 +2087,6 @@ def value_labels(self) -> dict[str, dict[int, str]]:
21352087
21362088
21372089@set_module ("pandas" )
2138- @Appender (_read_stata_doc )
21392090def read_stata (
21402091 filepath_or_buffer : FilePath | ReadBuffer [bytes ],
21412092 * ,
@@ -2151,6 +2102,122 @@ def read_stata(
21512102 compression : CompressionOptions = "infer" ,
21522103 storage_options : StorageOptions | None = None ,
21532104) -> DataFrame | StataReader :
2105+ """
2106+ Read Stata file into DataFrame.
2107+
2108+ Parameters
2109+ ----------
2110+ filepath_or_buffer : str, path object or file-like object
2111+ Any valid string path is acceptable. The string could be a URL. Valid
2112+ URL schemes include http, ftp, s3, and file. For file URLs, a host is
2113+ expected. A local file could be: ``file://localhost/path/to/table.dta``.
2114+
2115+ If you want to pass in a path object, pandas accepts any ``os.PathLike``.
2116+
2117+ By file-like object, we refer to objects with a ``read()`` method,
2118+ such as a file handle (e.g. via builtin ``open`` function)
2119+ or ``StringIO``.
2120+ convert_dates : bool, default True
2121+ Convert date variables to DataFrame time values.
2122+ convert_categoricals : bool, default True
2123+ Read value labels and convert columns to Categorical/Factor variables.
2124+ index_col : str, optional
2125+ Column to set as index.
2126+ convert_missing : bool, default False
2127+ Flag indicating whether to convert missing values to their Stata
2128+ representations. If False, missing values are replaced with nan.
2129+ If True, columns containing missing values are returned with
2130+ object data types and missing values are represented by
2131+ StataMissingValue objects.
2132+ preserve_dtypes : bool, default True
2133+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
2134+ default types for foreign data (float64 or int64).
2135+ columns : list or None
2136+ Columns to retain. Columns will be returned in the given order. None
2137+ returns all columns.
2138+ order_categoricals : bool, default True
2139+ Flag indicating whether converted categorical data are ordered.
2140+ chunksize : int, default None
2141+ Return StataReader object for iterations, returns chunks with
2142+ given number of lines.
2143+ iterator : bool, default False
2144+ Return StataReader object.
2145+ compression : str or dict, default 'infer'
2146+ For on-the-fly decompression of on-disk data. If 'infer' and
2147+ 'filepath_or_buffer' is path-like, then detect compression from the
2148+ following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
2149+ '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
2150+ If using 'zip' or 'tar', the archive must contain only one
2151+ data file to be read in. Set to ``None`` for no decompression.
2152+ Can also be a dict with key ``'method'`` set to one of
2153+ {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
2154+ other key-value pairs are forwarded to
2155+ ``zipfile.ZipFile``, ``gzip.GzipFile``,
2156+ ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
2157+ ``tarfile.TarFile``, respectively.
2158+ As an example, the following could be passed for Zstandard decompression using a
2159+ custom compression dictionary:
2160+ ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
2161+
2162+ .. versionadded:: 1.5.0
2163+ Added support for `.tar` files.
2164+ storage_options : dict, optional
2165+ Extra options that make sense for a particular storage connection, e.g.
2166+ host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
2167+ are forwarded to ``urllib.request.Request`` as header options. For other
2168+ URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
2169+ forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
2170+ details, and for more examples on storage options refer `here
2171+ <https://pandas.pydata.org/docs/user_guide/io.html?
2172+ highlight=storage_options#reading-writing-remote-files>`_.
2173+
2174+ Returns
2175+ -------
2176+ DataFrame, pandas.api.typing.StataReader
2177+ If iterator or chunksize, returns StataReader, else DataFrame.
2178+
2179+ See Also
2180+ --------
2181+ io.stata.StataReader : Low-level reader for Stata data files.
2182+ DataFrame.to_stata : Export Stata data files.
2183+
2184+ Notes
2185+ -----
2186+ Categorical variables read through an iterator may not have the same
2187+ categories and dtype. This occurs when a variable stored in a DTA
2188+ file is associated to an incomplete set of value labels that only
2189+ label a strict subset of the values.
2190+
2191+ Examples
2192+ --------
2193+
2194+ Create a dummy Stata file for this example:
2195+
2196+ >>> df = pd.DataFrame(
2197+ ... {
2198+ ... "animal": ["falcon", "parrot", "falcon", "parrot"],
2199+ ... "speed": [350, 18, 361, 15],
2200+ ... }
2201+ ... ) # doctest: +SKIP
2202+ >>> df.to_stata("animals.dta") # doctest: +SKIP
2203+
2204+ Read a Stata dta file:
2205+
2206+ >>> df = pd.read_stata("animals.dta") # doctest: +SKIP
2207+
2208+ Read a Stata dta file in 10,000 line chunks:
2209+
2210+ >>> values = np.random.randint(
2211+ ... 0, 10, size=(20_000, 1), dtype="uint8"
2212+ ... ) # doctest: +SKIP
2213+ >>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
2214+ >>> df.to_stata("filename.dta") # doctest: +SKIP
2215+
2216+ >>> with pd.read_stata("filename.dta", chunksize=10000) as itr:  # doctest: +SKIP
2217+ ...     for chunk in itr:
2218+ ...         # Operate on a single chunk, e.g., chunk.mean()
2219+ ...         pass  # doctest: +SKIP
2220+ """
21542221 reader = StataReader (
21552222 filepath_or_buffer ,
21562223 convert_dates = convert_dates ,
0 commit comments