From 748ad72a0a5f4ebaa4f593cd29f97435fff4128e Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Tue, 14 Oct 2025 18:48:46 +0800 Subject: [PATCH 01/24] Update readers.py --- pandas/io/parsers/readers.py | 882 +++++++++++++++++++++++++++++++++-- 1 file changed, 850 insertions(+), 32 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 458c8dd201d0a..b651b2700a70a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -761,21 +761,6 @@ def read_csv( ) -> DataFrame | TextFileReader: ... -@Appender( - _doc_read_csv_and_table.format( - func_name="read_csv", - summary="Read a comma-separated values (csv) file into DataFrame.", - see_also_func_name="read_table", - see_also_func_summary="Read general delimited file into DataFrame.", - na_values_str=fill( - '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " - ), - _default_sep="','", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) @set_module("pandas") def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -834,6 +819,431 @@ def read_csv( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: + """Read a comma-separated values (csv) file into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the online docs for + `IO Tools `_. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. + sep : str, default ',' + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + In addition, separators longer than 1 character and different from + ``'\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. Regex example: ``'\r\t'``. + delimiter : str, optional + Alias for ``sep``. + header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly to ``names`` then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a :class:`~pandas.MultiIndex` on the columns + e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). 
Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. + + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the form + ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` + in the case of MultiIndex columns. + names : Sequence of Hashable, optional + Sequence of column labels to apply. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + index_col : Hashable, Sequence of Hashable or False, optional + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. + usecols : Sequence of Hashable or Callable, optional + Subset of columns to select, denoted either by column labels or column indices. + If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in ``names`` or + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order + preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + for columns in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to ``True``. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. + dtype : dtype or dict of {Hashable : dtype}, optional + Data type(s) to apply to either the whole dataset or individual columns. + E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` + Use ``str`` or ``object`` together with suitable ``na_values`` settings + to preserve and not interpret ``dtype``. + If ``converters`` are specified, they will be applied INSTEAD + of ``dtype`` conversion. + + .. versionadded:: 1.5.0 + Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where + the default determines the ``dtype`` of the columns which are not explicitly + listed. + engine : {'c', 'python', 'pyarrow'}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + The 'pyarrow' engine was added as an *experimental* engine, and some features + are unsupported, or may not work correctly, with this engine. 
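+
+        For example, an illustrative opt-in to the experimental engine
+        (``'data.csv'`` is a stand-in file name):
+
+        >>> pd.read_csv('data.csv', engine='pyarrow')  # doctest: +SKIP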
+ converters : dict of {Hashable : Callable}, optional + Functions for converting values in specified columns. Keys can either + be column labels or column indices. + true_values : list, optional + Values to consider as ``True`` in addition to case-insensitive variants of 'True'. + false_values : list, optional + Values to consider as ``False`` in addition to case-insensitive variants of 'False'. + skipinitialspace : bool, default False + Skip spaces after delimiter. + skiprows : int, list of int or Callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (``int``) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning ``True`` if the row should be skipped and ``False`` otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. + skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). + nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. + Refers to the number of data rows in the returned DataFrame, excluding: + + * The header row containing column names. + * Rows before the header row, if ``header=1`` or larger. + + Example usage: + + * To read the first 999,999 (non-header) rows: + ``read_csv(..., nrows=999999)`` + + * To read rows 1,000,000 through 1,999,999: + ``read_csv(..., skiprows=1000000, nrows=999999)`` + na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional + Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific + per-column ``NA`` values. By default the following values are interpreted as + ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", + "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", + "None", "n/a", "nan", "null". + keep_default_na : bool, default True + Whether or not to include the default ``NaN`` values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + is appended to the default ``NaN`` values used for parsing. + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only + the default ``NaN`` values are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the ``NaN`` values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no + strings will be parsed as ``NaN``. + + Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. + na_filter : bool, default True + Detect missing value markers (empty strings and the value of ``na_values``). In + data without any ``NA`` values, passing ``na_filter=False`` can improve the + performance of reading a large file. + skip_blank_lines : bool, default True + If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. + parse_dates : bool, None, list of Hashable, default None + The behavior is as follows: + + * ``bool``. If ``True`` -> try parsing the index. + * ``None``. Behaves like ``True`` if ``date_format`` is specified. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + each as a separate date column. 
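+
+        As an illustrative sketch (``'data.csv'`` and the ``'Date'`` column are
+        stand-ins for your own file and column), list-based parsing looks like:
+
+        >>> pd.read_csv('data.csv', parse_dates=['Date'])  # doctest: +SKIP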
+ + If a column or index cannot be represented as an array of ``datetime``, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an ``object`` data type. For + non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after + :func:`~pandas.read_csv`. + + Note: A fast-path exists for iso8601-formatted dates. + date_format : str or dict of column -> format, optional + Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + .. versionadded:: 2.0.0 + dayfirst : bool, default False + DD/MM format dates, international and European format. + cache_dates : bool, default True + If ``True``, use a cache of unique, converted dates to apply the ``datetime`` + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + iterator : bool, default False + Return ``TextFileReader`` object for iteration or getting chunks with + ``get_chunk()``. + chunksize : int, optional + Number of lines to read from the file per chunk. Passing a value will cause the + function to return a ``TextFileReader`` object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + compression : str or dict, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is + path-like, then detect compression from the following extensions: '.gz', + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' + (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in. + Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and + other key-value pairs are forwarded to + ``zipfile.ZipFile``, ``gzip.GzipFile``, + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or + ``tarfile.TarFile``, respectively. + As an example, the following could be passed for Zstandard decompression using a + custom compression dictionary: + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + + .. versionchanged:: 1.4.0 Zstandard support. + thousands : str (length 1), optional + Character acting as the thousands separator in numerical values. + decimal : str (length 1), default '.' + Character to recognize as decimal point (e.g., use ',' for European data). + lineterminator : str (length 1), optional + Character used to denote a line break. Only valid with C parser. + quotechar : str (length 1), optional + Character used to denote the start and end of a quoted item. Quoted + items can include the ``delimiter`` and it will be ignored. + quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL + Control field quoting behavior per ``csv.QUOTE_*`` constants. 
Default is + ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special + characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, + or ``lineterminator``. + doublequote : bool, default True + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive ``quotechar`` elements INSIDE a + field as a single ``quotechar`` element. + escapechar : str (length 1), optional + Character used to escape other characters. + comment : str (length 1), optional + Character indicating that the remainder of line should not be parsed. + If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter ``header`` but not by + ``skiprows``. For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being + treated as the header. + encoding : str, optional, default 'utf-8' + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python + standard encodings + `_ . + encoding_errors : str, optional, default 'strict' + How encoding errors are treated. `List of possible values + `_ . + + .. versionadded:: 1.3.0 + dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: ``delimiter``, ``doublequote``, ``escapechar``, + ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to + override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` + documentation for more details. + on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are: + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'warn'``, raise a warning when a bad line is encountered and skip that line. + - ``'skip'``, skip bad lines without raising or warning when they are encountered. + - Callable, function that will process a single bad line. + - With ``engine='python'``, function with signature + ``(bad_line: list[str]) -> list[str] | None``. + ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. + If the function returns a new ``list`` of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. + - With ``engine='pyarrow'``, function with signature + as described in pyarrow documentation: `invalid_row_handler + `_. + + .. versionadded:: 1.3.0 + + .. versionadded:: 1.4.0 + Callable + + .. versionchanged:: 2.2.0 + Callable for ``engine='pyarrow'`` + low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. + Note that the entire file is read into a single :class:`~pandas.DataFrame` + regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + chunks. (Only valid with C parser). + memory_map : bool, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. 
Using this + option can improve performance because there is no longer any I/O overhead. + float_precision : {'high', 'legacy', 'round_trip'}, optional + Specifies which converter the C engine should use for floating-point + values. The options are ``None`` or ``'high'`` for the ordinary converter, + ``'legacy'`` for the original lower precision pandas converter, and + ``'round_trip'`` for the round-trip converter. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. + dtype_backend : {'numpy_nullable', 'pyarrow'} + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + + .. versionadded:: 2.0 + + Returns + ------- + DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_table : Read general delimited file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Examples + -------- + >>> pd.read_csv('data.csv') # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + 2 #baz 3 + + Index and header can be specified via the `index_col` and `header` arguments. + + >>> pd.read_csv('data.csv', header=None) # doctest: +SKIP + 0 1 + 0 Name Value + 1 foo 1 + 2 bar 2 + 3 #baz 3 + + >>> pd.read_csv('data.csv', index_col='Value') # doctest: +SKIP + Name + Value + 1 foo + 2 bar + 3 #baz + + Column types are inferred but can be explicitly specified using the dtype argument. + + >>> pd.read_csv('data.csv', dtype={'Value': float}) # doctest: +SKIP + Name Value + 0 foo 1.0 + 1 bar 2.0 + 2 #baz 3.0 + + True, False, and NA values, and thousands separators have defaults, + but can be explicitly specified, too. Supply the values you would like + as strings or lists of strings! + + >>> pd.read_csv('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP + Name Value + 0 NaN 1 + 1 NaN 2 + 2 #baz 3 + + Comment lines in the input file can be skipped using the `comment` argument. + + >>> pd.read_csv('data.csv', comment='#') # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + + By default, columns with dates will be read as ``object`` rather than ``datetime``. + + >>> df = pd.read_csv('tmp.csv') # doctest: +SKIP + + >>> df # doctest: +SKIP + col 1 col 2 col 3 + 0 10 10/04/2018 Sun 15 Jan 2023 + 1 20 15/04/2018 Fri 12 May 2023 + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 object + col 3 object + dtype: object + + Specific columns can be parsed as dates by using the `parse_dates` and + `date_format` arguments. + + >>> df = pd.read_csv( + ... 'tmp.csv', + ... parse_dates=[1, 2], + ... date_format={'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, + ... 
) # doctest: +SKIP + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 datetime64[ns] + col 3 datetime64[ns] + dtype: object + """ # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -894,23 +1304,6 @@ def read_table( ) -> DataFrame | TextFileReader: ... -@Appender( - _doc_read_csv_and_table.format( - func_name="read_table", - summary="Read general delimited file into DataFrame.", - see_also_func_name="read_csv", - see_also_func_summary=( - "Read a comma-separated values (csv) file into DataFrame." - ), - na_values_str=fill( - '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " - ), - _default_sep=r"'\\t' (tab-stop)", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) @set_module("pandas") def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -969,6 +1362,431 @@ def read_table( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: + """Read general delimited file into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the online docs for + `IO Tools `_. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. + sep : str, default '\\t' (tab-stop) + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + In addition, separators longer than 1 character and different from + ``'\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. Regex example: ``'\r\t'``. + delimiter : str, optional + Alias for ``sep``. + header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly to ``names`` then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a :class:`~pandas.MultiIndex` on the columns + e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. 
+ + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the form + ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` + in the case of MultiIndex columns. + names : Sequence of Hashable, optional + Sequence of column labels to apply. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + index_col : Hashable, Sequence of Hashable or False, optional + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. + usecols : Sequence of Hashable or Callable, optional + Subset of columns to select, denoted either by column labels or column indices. + If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in ``names`` or + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order + preserved use ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + for columns in ``['foo', 'bar']`` order or + ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to ``True``. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. + dtype : dtype or dict of {Hashable : dtype}, optional + Data type(s) to apply to either the whole dataset or individual columns. + E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` + Use ``str`` or ``object`` together with suitable ``na_values`` settings + to preserve and not interpret ``dtype``. + If ``converters`` are specified, they will be applied INSTEAD + of ``dtype`` conversion. + + .. versionadded:: 1.5.0 + Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where + the default determines the ``dtype`` of the columns which are not explicitly + listed. + engine : {'c', 'python', 'pyarrow'}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + The 'pyarrow' engine was added as an *experimental* engine, and some features + are unsupported, or may not work correctly, with this engine. + converters : dict of {Hashable : Callable}, optional + Functions for converting values in specified columns. Keys can either + be column labels or column indices. 
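+
+        A minimal sketch (``'data.txt'`` and the ``'Name'`` column are
+        stand-ins for your own data):
+
+        >>> pd.read_table('data.txt', converters={'Name': str.strip})  # doctest: +SKIP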
+ true_values : list, optional + Values to consider as ``True`` in addition to case-insensitive variants of 'True'. + false_values : list, optional + Values to consider as ``False`` in addition to case-insensitive variants of 'False'. + skipinitialspace : bool, default False + Skip spaces after delimiter. + skiprows : int, list of int or Callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (``int``) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning ``True`` if the row should be skipped and ``False`` otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. + skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). + nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. + Refers to the number of data rows in the returned DataFrame, excluding: + + * The header row containing column names. + * Rows before the header row, if ``header=1`` or larger. + + Example usage: + + * To read the first 999,999 (non-header) rows: + ``read_table(..., nrows=999999)`` + + * To read rows 1,000,000 through 1,999,999: + ``read_table(..., skiprows=1000000, nrows=999999)`` + na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional + Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific + per-column ``NA`` values. By default the following values are interpreted as + ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", + "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", + "None", "n/a", "nan", "null". + keep_default_na : bool, default True + Whether or not to include the default ``NaN`` values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + is appended to the default ``NaN`` values used for parsing. + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only + the default ``NaN`` values are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the ``NaN`` values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no + strings will be parsed as ``NaN``. + + Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. + na_filter : bool, default True + Detect missing value markers (empty strings and the value of ``na_values``). In + data without any ``NA`` values, passing ``na_filter=False`` can improve the + performance of reading a large file. + skip_blank_lines : bool, default True + If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. + parse_dates : bool, None, list of Hashable, default None + The behavior is as follows: + + * ``bool``. If ``True`` -> try parsing the index. + * ``None``. Behaves like ``True`` if ``date_format`` is specified. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + each as a separate date column. + + If a column or index cannot be represented as an array of ``datetime``, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an ``object`` data type. 
For
+        non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
+        :func:`~pandas.read_table`.
+
+        Note: A fast-path exists for iso8601-formatted dates.
+    date_format : str or dict of column -> format, optional
+        Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``.
+        The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
+        `strftime documentation
+        <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_ for more information on choices, though
+        note that :const:`"%f"` will parse all the way up to nanoseconds.
+        You can also pass:
+
+        - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
+          time string (not necessarily in exactly the same format);
+        - "mixed", to infer the format for each element individually. This is risky,
+          and you should probably use it along with `dayfirst`.
+
+        .. versionadded:: 2.0.0
+    dayfirst : bool, default False
+        DD/MM format dates, international and European format.
+    cache_dates : bool, default True
+        If ``True``, use a cache of unique, converted dates to apply the ``datetime``
+        conversion. May produce significant speed-up when parsing duplicate
+        date strings, especially ones with timezone offsets.
+    iterator : bool, default False
+        Return ``TextFileReader`` object for iteration or getting chunks with
+        ``get_chunk()``.
+    chunksize : int, optional
+        Number of lines to read from the file per chunk. Passing a value will cause the
+        function to return a ``TextFileReader`` object for iteration.
+        See the `IO Tools docs
+        <https://pandas.pydata.org/docs/user_guide/io.html#io-chunking>`_
+        for more information on ``iterator`` and ``chunksize``.
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
+        path-like, then detect compression from the following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
+        Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+
+        .. versionchanged:: 1.4.0 Zstandard support.
+    thousands : str (length 1), optional
+        Character acting as the thousands separator in numerical values.
+    decimal : str (length 1), default '.'
+        Character to recognize as decimal point (e.g., use ',' for European data).
+    lineterminator : str (length 1), optional
+        Character used to denote a line break. Only valid with C parser.
+    quotechar : str (length 1), default '"'
+        Character used to denote the start and end of a quoted item. Quoted
+        items can include the ``delimiter`` and it will be ignored.
+    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL
+        Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
+        ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
+        characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
+        or ``lineterminator``).
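+
+        For instance, an illustrative way to disable quote handling entirely
+        (``'data.txt'`` is a stand-in file name):
+
+        >>> import csv
+        >>> pd.read_table('data.txt', quoting=csv.QUOTE_NONE)  # doctest: +SKIP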
+ doublequote : bool, default True + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive ``quotechar`` elements INSIDE a + field as a single ``quotechar`` element. + escapechar : str (length 1), optional + Character used to escape other characters. + comment : str (length 1), optional + Character indicating that the remainder of line should not be parsed. + If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter ``header`` but not by + ``skiprows``. For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being + treated as the header. + encoding : str, optional, default 'utf-8' + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python + standard encodings + `_ . + encoding_errors : str, optional, default 'strict' + How encoding errors are treated. `List of possible values + `_ . + + .. versionadded:: 1.3.0 + dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: ``delimiter``, ``doublequote``, ``escapechar``, + ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to + override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` + documentation for more details. + on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are: + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'warn'``, raise a warning when a bad line is encountered and skip that line. + - ``'skip'``, skip bad lines without raising or warning when they are encountered. + - Callable, function that will process a single bad line. + - With ``engine='python'``, function with signature + ``(bad_line: list[str]) -> list[str] | None``. + ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. + If the function returns a new ``list`` of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. + - With ``engine='pyarrow'``, function with signature + as described in pyarrow documentation: `invalid_row_handler + `_. + + .. versionadded:: 1.3.0 + + .. versionadded:: 1.4.0 + Callable + + .. versionchanged:: 2.2.0 + Callable for ``engine='pyarrow'`` + low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. + Note that the entire file is read into a single :class:`~pandas.DataFrame` + regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + chunks. (Only valid with C parser). + memory_map : bool, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. + float_precision : {'high', 'legacy', 'round_trip'}, optional + Specifies which converter the C engine should use for floating-point + values. 
The options are ``None`` or ``'high'`` for the ordinary converter, + ``'legacy'`` for the original lower precision pandas converter, and + ``'round_trip'`` for the round-trip converter. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. + dtype_backend : {'numpy_nullable', 'pyarrow'} + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + + .. versionadded:: 2.0 + + Returns + ------- + DataFrame or TextFileReader + A general delimited file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Examples + -------- + >>> pd.read_table('data.txt') # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + 2 #baz 3 + + Index and header can be specified via the `index_col` and `header` arguments. + + >>> pd.read_table('data.txt', header=None) # doctest: +SKIP + 0 1 + 0 Name Value + 1 foo 1 + 2 bar 2 + 3 #baz 3 + + >>> pd.read_table('data.txt', index_col='Value') # doctest: +SKIP + Name + Value + 1 foo + 2 bar + 3 #baz + + Column types are inferred but can be explicitly specified using the dtype argument. + + >>> pd.read_table('data.txt', dtype={'Value': float}) # doctest: +SKIP + Name Value + 0 foo 1.0 + 1 bar 2.0 + 2 #baz 3.0 + + True, False, and NA values, and thousands separators have defaults, + but can be explicitly specified, too. Supply the values you would like + as strings or lists of strings! + + >>> pd.read_table('data.txt', na_values=['foo', 'bar']) # doctest: +SKIP + Name Value + 0 NaN 1 + 1 NaN 2 + 2 #baz 3 + + Comment lines in the input file can be skipped using the `comment` argument. + + >>> pd.read_table('data.txt', comment='#') # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + + By default, columns with dates will be read as ``object`` rather than ``datetime``. + + >>> df = pd.read_table('tmp.txt') # doctest: +SKIP + + >>> df # doctest: +SKIP + col 1 col 2 col 3 + 0 10 10/04/2018 Sun 15 Jan 2023 + 1 20 15/04/2018 Fri 12 May 2023 + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 object + col 3 object + dtype: object + + Specific columns can be parsed as dates by using the `parse_dates` and + `date_format` arguments. + + >>> df = pd.read_table( + ... 'tmp.txt', + ... parse_dates=[1, 2], + ... date_format={'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, + ... 
) # doctest: +SKIP + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 datetime64[ns] + col 3 datetime64[ns] + dtype: object + """ # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] From 54f97146d41553a4067785d62540b6be6c168770 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Wed, 15 Oct 2025 18:29:17 +0800 Subject: [PATCH 02/24] Update readers.py --- pandas/io/parsers/readers.py | 622 +++++++++++++++++++---------------- 1 file changed, 341 insertions(+), 281 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index b651b2700a70a..f647714f66b3d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -12,7 +12,6 @@ ) import csv import sys -from textwrap import fill from typing import ( IO, TYPE_CHECKING, @@ -36,7 +35,6 @@ ParserWarning, ) from pandas.util._decorators import ( - Appender, set_module, ) from pandas.util._exceptions import find_stack_level @@ -53,7 +51,6 @@ from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex -from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( IOHandles, @@ -819,7 +816,8 @@ def read_csv( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - """Read a comma-separated values (csv) file into DataFrame. + r""" + Read a comma-separated values (csv) file into DataFrame. Also supports optionally iterating or breaking of the file into chunks. @@ -852,7 +850,8 @@ def read_csv( Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + data (zero-indexed). Default behavior is to infer the column names: if no + ``names`` are passed the behavior is identical to ``header=0`` and column names are inferred from the first line of the file, if column names are passed explicitly to ``names`` then the behavior is identical to @@ -876,7 +875,8 @@ def read_csv( Duplicates in this list are not allowed. index_col : Hashable, Sequence of Hashable or False, optional Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` will be formed for the row labels. Note: ``index_col=False`` can be used to force pandas to *not* use the first @@ -911,24 +911,30 @@ def read_csv( of ``dtype`` conversion. .. versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where + Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input + where the default determines the ``dtype`` of the columns which are not explicitly listed. engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by + Parser engine to use. The C and pyarrow engines are faster, + while the python engine + is currently more feature-complete. + Multithreading is currently only supported by the pyarrow engine. .. 
versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and some features + The 'pyarrow' engine was added as an *experimental* engine, + and some features are unsupported, or may not work correctly, with this engine. converters : dict of {Hashable : Callable}, optional Functions for converting values in specified columns. Keys can either be column labels or column indices. true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. + Values to consider as ``True`` + in addition to case-insensitive variants of 'True'. false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. + Values to consider as ``False`` + in addition to case-insensitive variants of 'False'. skipinitialspace : bool, default False Skip spaces after delimiter. skiprows : int, list of int or Callable, optional @@ -936,7 +942,8 @@ def read_csv( at the start of the file. If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. + indices, returning ``True`` if the row + should be skipped and ``False`` otherwise. An example of a valid callable argument would be ``lambda x: x in [0, 2]``. skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). @@ -954,7 +961,8 @@ def read_csv( * To read rows 1,000,000 through 1,999,999: ``read_csv(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional + na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, + optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", @@ -964,7 +972,8 @@ def read_csv( Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, + ``na_values`` is appended to the default ``NaN`` values used for parsing. * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default ``NaN`` values are used for parsing. @@ -973,7 +982,8 @@ def read_csv( * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no strings will be parsed as ``NaN``. - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + Note that if ``na_filter`` is passed in as ``False``, + the ``keep_default_na`` and ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of ``na_values``). In @@ -986,7 +996,8 @@ def read_csv( * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing + columns 1, 2, 3 each as a separate date column. If a column or index cannot be represented as an array of ``datetime``, @@ -997,7 +1008,8 @@ def read_csv( Note: A fast-path exists for iso8601-formatted dates. 
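+
+        As an illustrative sketch (``'data.csv'`` and the ``'Date'`` column
+        are stand-ins, and the format string is just one possibility):
+
+        >>> pd.read_csv(
+        ...     'data.csv', parse_dates=['Date'], date_format='%d/%m/%Y'
+        ... )  # doctest: +SKIP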
date_format : str or dict of column -> format, optional
-        Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``.
+        Format to use for parsing dates and/or times
+        when used in conjunction with ``parse_dates``.
         The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
         `strftime documentation
         <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_ for more information on choices, though
         note that :const:`"%f"` will parse all the way up to nanoseconds.
         You can also pass:

         - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
           time string (not necessarily in exactly the same format);
         - "mixed", to infer the format for each element individually. This is risky,
           and you should probably use it along with `dayfirst`.

         .. versionadded:: 2.0.0
     dayfirst : bool, default False
         DD/MM format dates, international and European format.
     cache_dates : bool, default True
         If ``True``, use a cache of unique, converted dates to apply the ``datetime``
         conversion. May produce significant speed-up when parsing duplicate
         date strings, especially ones with timezone offsets.
     iterator : bool, default False
         Return ``TextFileReader`` object for iteration or getting chunks with
         ``get_chunk()``.
     chunksize : int, optional
         Number of lines to read from the file per chunk. Passing a value will cause the
         function to return a ``TextFileReader`` object for iteration.
         See the `IO Tools docs
         <https://pandas.pydata.org/docs/user_guide/io.html#io-chunking>`_
         for more information on ``iterator`` and ``chunksize``.
     compression : str or dict, default 'infer'
-        For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is
         path-like, then detect compression from the following extensions: '.gz',
-        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
-        (otherwise no compression).
-        If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or
+        '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data
+        file to be read in.
         Set to ``None`` for no decompression.
-        Can also be a dict with key ``'method'`` set
-        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
-        other key-value pairs are forwarded to
-        ``zipfile.ZipFile``, ``gzip.GzipFile``,
-        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
-        ``tarfile.TarFile``, respectively.
-        As an example, the following could be passed for Zstandard decompression using a
-        custom compression dictionary:
+        Can also be a dict with key ``'method'`` set to one of {``'zip'``,
+        ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other
+        key-value pairs are forwarded to ``zipfile.ZipFile``,
+        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
+        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard
+        decompression using a custom compression dictionary:
         ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.

         .. versionadded:: 1.5.0
             Added support for `.tar` files.

         .. versionchanged:: 1.4.0 Zstandard support.
     thousands : str (length 1), optional
         Character acting as the thousands separator in numerical values.
     decimal : str (length 1), default '.'
-        Character to recognize as decimal point (e.g., use ',' for European data).
+        Character to recognize as decimal point (e.g., use ',' for European
+        data).
     lineterminator : str (length 1), optional
         Character used to denote a line break. Only valid with C parser.
     quotechar : str (length 1), optional
         Character used to denote the start and end of a quoted item. Quoted
         items can include the ``delimiter`` and it will be ignored.
-    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL
-        Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
-        ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
-        characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
-        or ``lineterminator``.
+    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL
+        Control field quoting behavior per ``csv.QUOTE_*`` constants.
+        Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only
+        fields containing special characters are quoted (e.g., characters
+        defined in ``quotechar``, ``delimiter``, or ``lineterminator``).
doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive ``quotechar`` elements INSIDE a - field as a single ``quotechar`` element. + When ``quotechar`` is specified and ``quoting`` is not + ``QUOTE_NONE``, indicate whether or not to interpret two + consecutive ``quotechar`` elements INSIDE a field as a single + ``quotechar`` element. escapechar : str (length 1), optional Character used to escape other characters. comment : str (length 1), optional - Character indicating that the remainder of line should not be parsed. - If found at the beginning - of a line, the line will be ignored altogether. This parameter must be a - single character. Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter ``header`` but not by + Character indicating that the remainder of line should not be + parsed. If found at the beginning of a line, the line will be + ignored altogether. This parameter must be a single character. + Like empty lines (as long as ``skip_blank_lines=True``), fully + commented lines are ignored by the parameter ``header`` but not by ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being - treated as the header. + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in + ``'a,b,c'`` being treated as the header. encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python - standard encodings - `_ . + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). + `List of Python standard encodings + `_. encoding_errors : str, optional, default 'strict' How encoding errors are treated. `List of possible values - `_ . + `_. .. versionadded:: 1.3.0 dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) for the - following parameters: ``delimiter``, ``doublequote``, ``escapechar``, - ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to - override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` + If provided, this parameter will override values (default or not) + for the following parameters: ``delimiter``, ``doublequote``, + ``escapechar``, ``skipinitialspace``, ``quotechar``, and + ``quoting``. If it is necessary to override values, a + ``ParserWarning`` will be issued. See ``csv.Dialect`` documentation for more details. on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are: + Specifies what to do upon encountering a bad line (a line with too + many fields). Allowed values are: - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - - ``'skip'``, skip bad lines without raising or warning when they are encountered. + - ``'warn'``, raise a warning when a bad line is encountered and + skip that line. + - ``'skip'``, skip bad lines without raising or warning when they + are encountered. - Callable, function that will process a single bad line. - With ``engine='python'``, function with signature ``(bad_line: list[str]) -> list[str] | None``. ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. 
- If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. - - With ``engine='pyarrow'``, function with signature - as described in pyarrow documentation: `invalid_row_handler + If the function returns ``None``, the bad line will be + ignored. + If the function returns a new ``list`` of strings with more + elements than expected, a ``ParserWarning`` will be emitted + while dropping extra elements. + - With ``engine='pyarrow'``, function with signature as + described in pyarrow documentation: `invalid_row_handler `_. @@ -1118,38 +1140,44 @@ def read_csv( .. versionchanged:: 2.2.0 Callable for ``engine='pyarrow'`` low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory use - while parsing, but possibly mixed type inference. To ensure no mixed - types either set ``False``, or specify the type with the ``dtype`` parameter. - Note that the entire file is read into a single :class:`~pandas.DataFrame` - regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + Internally process the file in chunks, resulting in lower memory + use while parsing, but possibly mixed type inference. To ensure + no mixed types either set ``False``, or specify the type with the + ``dtype`` parameter. Note that the entire file is read into a + single :class:`~pandas.DataFrame` regardless, use the + ``chunksize`` or ``iterator`` parameter to return the data in chunks. (Only valid with C parser). memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file object - directly onto memory and access the data directly from there. Using this - option can improve performance because there is no longer any I/O overhead. + If a filepath is provided for ``filepath_or_buffer``, map the file + object directly onto memory and access the data directly from + there. Using this option can improve performance because there is + no longer any I/O overhead. float_precision : {'high', 'legacy', 'round_trip'}, optional - Specifies which converter the C engine should use for floating-point - values. The options are ``None`` or ``'high'`` for the ordinary converter, - ``'legacy'`` for the original lower precision pandas converter, and - ``'round_trip'`` for the round-trip converter. + Specifies which converter the C engine should use for + floating-point values. The options are ``None`` or ``'high'`` for + the ordinary converter, ``'legacy'`` for the original lower + precision pandas converter, and ``'round_trip'`` for the + round-trip converter. storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc. For HTTP(S) URLs the key-value pairs - are forwarded to ``urllib.request.Request`` as header options. For other - URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are - forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more - details, and for more examples on storage options refer `here + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. For HTTP(S) URLs the + key-value pairs are forwarded to ``urllib.request.Request`` as + header options. For other URLs (e.g. starting with "s3://", and + "gcs://") the key-value pairs are forwarded to ``fsspec.open``. 
+ Please see ``fsspec`` and ``urllib`` for more details, and for + more examples on storage options refer `here `_. dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior - is to not use nullable data types. If specified, the behavior - is as follows: + (still experimental). If not specified, the default behavior is + to not use nullable data types. If specified, the behavior is as + follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + * ``"numpy_nullable"``: returns nullable-dtype-backed + :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1161,37 +1189,40 @@ def read_csv( See Also -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) + file. read_table : Read general delimited file into DataFrame. read_fwf : Read a table of fixed-width formatted lines into DataFrame. Examples -------- - >>> pd.read_csv('data.csv') # doctest: +SKIP + >>> pd.read_csv("data.csv") # doctest: +SKIP Name Value 0 foo 1 1 bar 2 2 #baz 3 - Index and header can be specified via the `index_col` and `header` arguments. + Index and header can be specified via the `index_col` and `header` + arguments. - >>> pd.read_csv('data.csv', header=None) # doctest: +SKIP + >>> pd.read_csv("data.csv", header=None) # doctest: +SKIP 0 1 0 Name Value 1 foo 1 2 bar 2 3 #baz 3 - >>> pd.read_csv('data.csv', index_col='Value') # doctest: +SKIP + >>> pd.read_csv("data.csv", index_col="Value") # doctest: +SKIP Name Value 1 foo 2 bar 3 #baz - Column types are inferred but can be explicitly specified using the dtype argument. + Column types are inferred but can be explicitly specified using the + dtype argument. - >>> pd.read_csv('data.csv', dtype={'Value': float}) # doctest: +SKIP + >>> pd.read_csv("data.csv", dtype={"Value": float}) # doctest: +SKIP Name Value 0 foo 1.0 1 bar 2.0 @@ -1201,22 +1232,24 @@ def read_csv( but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! - >>> pd.read_csv('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP + >>> pd.read_csv("data.csv", na_values=["foo", "bar"]) # doctest: +SKIP Name Value 0 NaN 1 1 NaN 2 2 #baz 3 - Comment lines in the input file can be skipped using the `comment` argument. + Comment lines in the input file can be skipped using the `comment` + argument. - >>> pd.read_csv('data.csv', comment='#') # doctest: +SKIP + >>> pd.read_csv("data.csv", comment="#") # doctest: +SKIP Name Value 0 foo 1 1 bar 2 - By default, columns with dates will be read as ``object`` rather than ``datetime``. + By default, columns with dates will be read as ``object`` rather than + ``datetime``. - >>> df = pd.read_csv('tmp.csv') # doctest: +SKIP + >>> df = pd.read_csv("tmp.csv") # doctest: +SKIP >>> df # doctest: +SKIP col 1 col 2 col 3 @@ -1233,9 +1266,9 @@ def read_csv( `date_format` arguments. >>> df = pd.read_csv( - ... 'tmp.csv', + ... "tmp.csv", ... parse_dates=[1, 2], - ... date_format={'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, + ... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, ... 
) # doctest: +SKIP >>> df.dtypes # doctest: +SKIP @@ -1362,7 +1395,8 @@ def read_table( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - """Read general delimited file into DataFrame. + """ + Read general delimited file into DataFrame. Also supports optionally iterating or breaking of the file into chunks. @@ -1388,90 +1422,95 @@ def read_table( be used and automatically detect the separator from only the first valid row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force + ``'\\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``. delimiter : str, optional Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this + data (zero-indexed). Default behavior is to infer the column names: + if no ``names`` are passed the behavior is identical to ``header=0`` + and column names are inferred from the first line of the file, if + column names are passed explicitly to ``names`` then the behavior is + identical to ``header=None``. Explicitly pass ``header=0`` to be + able to replace existing names. The header can be a list of integers + that specify row locations for a :class:`~pandas.MultiIndex` on the + columns e.g. ``[0, 1, 3]``. Intervening rows that are not specified + will be skipped (e.g. 2 in this example is skipped). Note that this parameter ignores commented lines and empty lines if ``skip_blank_lines=True``, so ``header=0`` denotes the first line of data rather than the first line of the file. When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` - in the case of MultiIndex columns. + each other by renaming duplicate names with a numeric suffix of the + form ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {i}"`` or + ``"Unnamed: {i}_level_{level}"`` in the case of MultiIndex columns. names : Sequence of Hashable, optional Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. + then you should explicitly pass ``header=0`` to override the column + names. Duplicates in this list are not allowed. 
index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` - will be formed for the row labels. + Column(s) to use as row label(s), denoted either by column labels or + column indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` will be formed for the row labels. - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. + Note: ``index_col=False`` can be used to force pandas to *not* use + the first column as the index, e.g., when you have a malformed file + with delimiters at the end of each line. usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order - preserved use ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + Subset of columns to select, denoted either by column labels or + column indices. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or + strings that correspond to column names provided either by the user + in ``names`` or inferred from the document header row(s). If + ``names`` are given, the document header row(s) are not taken into + account. For example, a valid list-like ``usecols`` parameter would + be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is + ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To + instantiate a :class:`~pandas.DataFrame` from ``data`` with element + order preserved use + ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns in ``['foo', 'bar']`` order or ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` for ``['bar', 'foo']`` order. - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. + If callable, the callable function will be evaluated against the + column names, returning names where the callable function evaluates + to ``True``. An example of a valid callable argument would be + ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this + parameter results in much faster parsing time and lower memory usage. dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. 
+ Data type(s) to apply to either the whole dataset or individual + columns. E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` + Use ``str`` or ``object`` together with suitable ``na_values`` + settings to preserve and not interpret ``dtype``. If ``converters`` are specified, they will be applied INSTEAD of ``dtype`` conversion. .. versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where - the default determines the ``dtype`` of the columns which are not explicitly - listed. + Support for ``defaultdict`` was added. Specify a ``defaultdict`` + as input where the default determines the ``dtype`` of the + columns which are not explicitly listed. engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by - the pyarrow engine. + Parser engine to use. The C and pyarrow engines are faster, while + the python engine is currently more feature-complete. Multithreading + is currently only supported by the pyarrow engine. .. versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and some features - are unsupported, or may not work correctly, with this engine. + The 'pyarrow' engine was added as an *experimental* engine, and + some features are unsupported, or may not work correctly, with + this engine. converters : dict of {Hashable : Callable}, optional Functions for converting values in specified columns. Keys can either be column labels or column indices. true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. + Values to consider as ``True`` in addition to case-insensitive + variants of 'True'. false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. + Values to consider as ``False`` in addition to case-insensitive + variants of 'False'. skipinitialspace : bool, default False Skip spaces after delimiter. skiprows : int, list of int or Callable, optional @@ -1479,13 +1518,16 @@ def read_table( at the start of the file. If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. + indices, returning ``True`` if the row should be skipped and ``False`` + otherwise. An example of a valid callable argument would be + ``lambda x: x in [0, 2]``. skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). + Number of lines at bottom of file to skip (Unsupported with + ``engine='c'``). nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. - Refers to the number of data rows in the returned DataFrame, excluding: + Number of rows of file to read. Useful for reading pieces of large + files. Refers to the number of data rows in the returned DataFrame, + excluding: * The header row containing column names. * Rows before the header row, if ``header=1`` or larger. @@ -1497,93 +1539,100 @@ def read_table( * To read rows 1,000,000 through 1,999,999: ``read_table(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. 
By default the following values are interpreted as - ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", - "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", - "None", "n/a", "nan", "null". + na_values : Hashable, Iterable of Hashable or dict of {Hashable : + Iterable}, optional + Additional strings to recognize as ``NA``/``NaN``. If ``dict`` + passed, specific per-column ``NA`` values. By default the following + values are interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA", + "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "", + "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null". keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing the data. - Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and - ``na_values`` parameters will be ignored. + Whether or not to include the default ``NaN`` values when parsing + the data. Depending on whether ``na_values`` is passed in, the + behavior is as follows: + + * If ``keep_default_na`` is ``True``, and ``na_values`` are + specified, ``na_values`` is appended to the default ``NaN`` values + used for parsing. + * If ``keep_default_na`` is ``True``, and ``na_values`` are not + specified, only the default ``NaN`` values are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are + specified, only the ``NaN`` values specified ``na_values`` are + used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not + specified, no strings will be parsed as ``NaN``. + + Note that if ``na_filter`` is passed in as ``False``, the + ``keep_default_na`` and ``na_values`` parameters will be ignored. na_filter : bool, default True - Detect missing value markers (empty strings and the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` can improve the - performance of reading a large file. + Detect missing value markers (empty strings and the value of + ``na_values``). In data without any ``NA`` values, passing + ``na_filter=False`` can improve the performance of reading a large + file. skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. + If ``True``, skip over blank lines rather than interpreting as + ``NaN`` values. parse_dates : bool, None, list of Hashable, default None The behavior is as follows: * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 - each as a separate date column. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try + parsing columns 1, 2, 3 each as a separate date column. 
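A standalone sketch of the list form above (tab-separated input to match ``read_table``'s default separator; the data is illustrative, not taken from this patch):

    from io import StringIO

    import pandas as pd

    data = "id\twhen\n1\t2023-01-15\n2\t2023-05-12\n"
    # Parse the second column (position 1) as a date column.
    df = pd.read_table(StringIO(data), parse_dates=[1])
    print(df.dtypes)  # when: datetime64[ns]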
- If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_table`. + If a column or index cannot be represented as an array of + ``datetime``, say because of an unparsable value or a mixture of + timezones, the column or index will be returned unaltered as an + ``object`` data type. For non-standard ``datetime`` parsing, use + :func:`~pandas.to_datetime` after :func:`~pandas.read_table`. Note: A fast-path exists for iso8601-formatted dates. date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation + Format to use for parsing dates and/or times when used in conjunction + with ``parse_dates``. The strftime to parse time, e.g. + :const:`"%d/%m/%Y"`. See `strftime documentation `_ for more information on choices, though - note that :const:`"%f"`` will parse all the way up to nanoseconds. - You can also pass: + #strftime-and-strptime-behavior>`_ for more information on choices, + though note that :const:`"%f"`` will parse all the way up to + nanoseconds. You can also pass: - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. + - "ISO8601", to parse any `ISO8601 + `_ time string (not + necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This + is risky, and you should probably use it along with `dayfirst`. .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. + If ``True``, use a cache of unique, converted dates to apply the + ``datetime`` conversion. May produce significant speed-up when + parsing duplicate date strings, especially ones with timezone + offsets. iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. + Return ``TextFileReader`` object for iteration or getting chunks + with ``get_chunk()``. chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs + Number of lines to read from the file per chunk. Passing a value + will cause the function to return a ``TextFileReader`` object for + iteration. See the `IO Tools docs `_ for more information on ``iterator`` and ``chunksize``. compression : str or dict, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is - path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2' - (otherwise no compression). - If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in. - Set to ``None`` for no decompression. 
- Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and - other key-value pairs are forwarded to - ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or - ``tarfile.TarFile``, respectively. - As an example, the following could be passed for Zstandard decompression using a - custom compression dictionary: + For on-the-fly decompression of on-disk data. If 'infer' and + 'filepath_or_buffer' is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', + '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one data + file to be read in. Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set to one of + {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} + and other key-value pairs are forwarded to ``zipfile.ZipFile``, + ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, + ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. + As an example, the following could be passed for Zstandard + decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. .. versionadded:: 1.5.0 @@ -1593,61 +1642,67 @@ def read_table( thousands : str (length 1), optional Character acting as the thousands separator in numerical values. decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European data). + Character to recognize as decimal point (e.g., use ',' for European + data). lineterminator : str (length 1), optional Character used to denote a line break. Only valid with C parser. quotechar : str (length 1), default '"' Character used to denote the start and end of a quoted item. Quoted items can include the ``delimiter`` and it will be ignored. - quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is - ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special - characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, - or ``lineterminator``. + quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, + 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default + csv.QUOTE_MINIMAL + Control field quoting behavior per ``csv.QUOTE_*`` constants. + Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only + fields containing special characters are quoted (e.g., characters + defined in ``quotechar``, ``delimiter``, or ``lineterminator``. doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive ``quotechar`` elements INSIDE a - field as a single ``quotechar`` element. + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, + indicate whether or not to interpret two consecutive ``quotechar`` + elements INSIDE a field as a single ``quotechar`` element. escapechar : str (length 1), optional Character used to escape other characters. comment : str (length 1), optional - Character indicating that the remainder of line should not be parsed. - If found at the beginning - of a line, the line will be ignored altogether. This parameter must be a - single character. 
Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter ``header`` but not by + Character indicating that the remainder of line should not be + parsed. If found at the beginning of a line, the line will be + ignored altogether. This parameter must be a single character. Like + empty lines (as long as ``skip_blank_lines=True``), fully commented + lines are ignored by the parameter ``header`` but not by ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being - treated as the header. + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in + ``'a,b,c'`` being treated as the header. encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python - standard encodings - `_ . + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). + `List of Python standard encodings + `_. encoding_errors : str, optional, default 'strict' How encoding errors are treated. `List of possible values - `_ . + `_. .. versionadded:: 1.3.0 dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) for the - following parameters: ``delimiter``, ``doublequote``, ``escapechar``, - ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to - override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. + If provided, this parameter will override values (default or not) + for the following parameters: ``delimiter``, ``doublequote``, + ``escapechar``, ``skipinitialspace``, ``quotechar``, and ``quoting``. + If it is necessary to override values, a ``ParserWarning`` will be + issued. See ``csv.Dialect`` documentation for more details. on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are: + Specifies what to do upon encountering a bad line (a line with too + many fields). Allowed values are: - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - - ``'skip'``, skip bad lines without raising or warning when they are encountered. + - ``'warn'``, raise a warning when a bad line is encountered and + skip that line. + - ``'skip'``, skip bad lines without raising or warning when they + are encountered. - Callable, function that will process a single bad line. - With ``engine='python'``, function with signature ``(bad_line: list[str]) -> list[str] | None``. ``bad_line`` is a list of strings split by the ``sep``. If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. + If the function returns a new ``list`` of strings with more + elements than expected, a ``ParserWarning`` will be emitted + while dropping extra elements. - With ``engine='pyarrow'``, function with signature as described in pyarrow documentation: `invalid_row_handler `_. dtype_backend : {'numpy_nullable', 'pyarrow'} @@ -1691,8 +1749,10 @@ def read_table( is to not use nullable data types. 
If specified, the behavior is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + * ``"numpy_nullable"``: returns nullable-dtype-backed + :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1704,13 +1764,13 @@ def read_table( See Also -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) + file. read_csv : Read a comma-separated values (csv) file into DataFrame. read_fwf : Read a table of fixed-width formatted lines into DataFrame. - Examples -------- - >>> pd.read_table('data.txt') # doctest: +SKIP + >>> pd.read_table("data.txt") # doctest: +SKIP Name Value 0 foo 1 1 bar 2 @@ -1718,14 +1778,14 @@ def read_table( Index and header can be specified via the `index_col` and `header` arguments. - >>> pd.read_table('data.txt', header=None) # doctest: +SKIP + >>> pd.read_table("data.txt", header=None) # doctest: +SKIP 0 1 0 Name Value 1 foo 1 2 bar 2 3 #baz 3 - >>> pd.read_table('data.txt', index_col='Value') # doctest: +SKIP + >>> pd.read_table("data.txt", index_col="Value") # doctest: +SKIP Name Value 1 foo @@ -1734,7 +1794,7 @@ def read_table( Column types are inferred but can be explicitly specified using the dtype argument. - >>> pd.read_table('data.txt', dtype={'Value': float}) # doctest: +SKIP + >>> pd.read_table("data.txt", dtype={"Value": float}) # doctest: +SKIP Name Value 0 foo 1.0 1 bar 2.0 @@ -1744,7 +1804,7 @@ def read_table( but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! - >>> pd.read_table('data.txt', na_values=['foo', 'bar']) # doctest: +SKIP + >>> pd.read_table("data.txt", na_values=["foo", "bar"]) # doctest: +SKIP Name Value 0 NaN 1 1 NaN 2 @@ -1752,14 +1812,14 @@ def read_table( Comment lines in the input file can be skipped using the `comment` argument. - >>> pd.read_table('data.txt', comment='#') # doctest: +SKIP + >>> pd.read_table("data.txt", comment="#") # doctest: +SKIP Name Value 0 foo 1 1 bar 2 By default, columns with dates will be read as ``object`` rather than ``datetime``. - >>> df = pd.read_table('tmp.txt') # doctest: +SKIP + >>> df = pd.read_table("tmp.txt") # doctest: +SKIP >>> df # doctest: +SKIP col 1 col 2 col 3 @@ -1776,9 +1836,9 @@ def read_table( `date_format` arguments. >>> df = pd.read_table( - ... 'tmp.txt', + ... "tmp.txt", ... parse_dates=[1, 2], - ... date_format={'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}, + ... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, ... ) # doctest: +SKIP >>> df.dtypes # doctest: +SKIP From dba41de104e1d2b3d796a3cb29177ab58af192a1 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Wed, 15 Oct 2025 20:11:45 +0800 Subject: [PATCH 03/24] Update readers.py --- pandas/io/parsers/readers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index f647714f66b3d..59848ffc3be4c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1768,6 +1768,7 @@ def read_table( file. read_csv : Read a comma-separated values (csv) file into DataFrame. read_fwf : Read a table of fixed-width formatted lines into DataFrame. 
+ Examples -------- >>> pd.read_table("data.txt") # doctest: +SKIP From 9567a310770a59f4b06f0774a37cdd1faf7c24e5 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Thu, 16 Oct 2025 01:38:21 +0800 Subject: [PATCH 04/24] Update readers.py --- pandas/io/parsers/readers.py | 62 +++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 59848ffc3be4c..a3f0849ece94d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -112,7 +112,8 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): skipfooter: int nrows: int | None na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None + Hashable | Iterable[Hashable] | Mapping[Hashable, + Iterable[Hashable]] | None ) keep_default_na: bool na_filter: bool @@ -577,7 +578,10 @@ class _Fwf_Defaults(TypedDict): widths: None -_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} +_fwf_defaults: _Fwf_Defaults = { + "colspecs": "infer", + "infer_nrows": 100, + "widths": None} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} _pyarrow_unsupported = { @@ -608,7 +612,10 @@ def validate_integer(name: str, val: float, min_val: int = ...) -> int: ... @overload -def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: ... +def validate_integer( + name: str, + val: int | None, + min_val: int = ...) -> int | None: ... def validate_integer( @@ -662,7 +669,9 @@ def _validate_names(names: Sequence[Hashable] | None) -> None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") if not ( - is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) + is_list_like( + names, allow_sets=False) or isinstance( + names, abc.KeysView) ): raise ValueError("Names should be an ordered collection.") @@ -781,7 +790,8 @@ def read_csv( nrows: int | None = None, # NA and Missing Data Handling na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None + Hashable | Iterable[Hashable] | Mapping[Hashable, + Iterable[Hashable]] | None ) = None, keep_default_na: bool = True, na_filter: bool = True, @@ -1360,7 +1370,8 @@ def read_table( nrows: int | None = None, # NA and Missing Data Handling na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None + Hashable | Iterable[Hashable] | Mapping[Hashable, + Iterable[Hashable]] | None ) = None, keep_default_na: bool = True, na_filter: bool = True, @@ -1974,7 +1985,8 @@ def read_fwf( if colspecs is None and widths is None: raise ValueError("Must specify either colspecs or widths") if colspecs not in (None, "infer") and widths is not None: - raise ValueError("You must specify only one of 'widths' and 'colspecs'") + raise ValueError( + "You must specify only one of 'widths' and 'colspecs'") # Compute 'colspecs' from 'widths', if specified. 
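A minimal standalone sketch of the conversion that the comment above announces (the variable names mirror the surrounding code; this is an illustration, not a line of the patch): each width is accumulated into a half-open ``(start, end)`` column span.

    # widths -> colspecs: a running offset turned into (start, end) pairs
    widths = [5, 3, 4]
    colspecs, col = [], 0
    for w in widths:
        colspecs.append((col, col + w))
        col += w
    assert colspecs == [(0, 5), (5, 8), (8, 12)]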
if widths is not None: @@ -2004,9 +2016,11 @@ def read_fwf( assert index_col is not lib.no_default len_index = len(index_col) - if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): + if kwds.get("usecols") is None and len( + names) + len_index != len(colspecs): # If usecols is used colspec may be longer than names - raise ValueError("Length of colspecs must match length of names") + raise ValueError( + "Length of colspecs must match length of names") check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) return _read( @@ -2098,7 +2112,8 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != getattr(value, "value", default) ): raise ValueError( - f"The {argname!r} option is not supported with the 'pyarrow' engine" + f"The { + argname!r} option is not supported with the 'pyarrow' engine" ) options[argname] = value @@ -2114,7 +2129,8 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: pass else: raise ValueError( - f"The {argname!r} option is not supported with the " + f"The { + argname!r} option is not supported with the " f"{engine!r} engine" ) else: @@ -2215,7 +2231,8 @@ def _clean_options( if "python" in engine: for arg in _python_unsupported: - if fallback_reason and result[arg] != _c_parser_defaults.get(arg): + if fallback_reason and result[arg] != _c_parser_defaults.get( + arg): raise ValueError( "Falling back to the 'python' engine because " f"{fallback_reason}, but this causes {arg!r} to be " @@ -2314,7 +2331,8 @@ def _make_engine( if engine not in mapping: raise ValueError( - f"Unknown engine: {engine} (valid options are {mapping.keys()})" + f"Unknown engine: {engine} (valid options are { + mapping.keys()})" ) if not isinstance(f, list): # open file here @@ -2399,7 +2417,8 @@ def read(self, nrows: int | None = None) -> DataFrame: dtype_arg = None if isinstance(dtype_arg, dict): - dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + # type: ignore[var-annotated] + dtype = defaultdict(lambda: None) dtype.update(dtype_arg) elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( np.str_, @@ -2417,7 +2436,8 @@ def read(self, nrows: int | None = None) -> DataFrame: if pandas_dtype(dtype[k]) in (np.str_, np.object_) else None ) - new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) + new_col_dict[k] = Series( + v, index=index, dtype=d, copy=False) else: new_col_dict = col_dict @@ -2508,7 +2528,8 @@ def TextParser(*args, **kwds) -> TextFileReader: return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): +def _clean_na_values(na_values, keep_default_na: bool = True, + floatify: bool = True): na_fvalues: set | dict if na_values is None: if keep_default_na: @@ -2648,7 +2669,8 @@ def _refine_defaults_read( ) if delimiter and (sep is not lib.no_default): - raise ValueError("Specified a sep and a delimiter; you can only specify one.") + raise ValueError( + "Specified a sep and a delimiter; you can only specify one.") kwds["names"] = None if names is lib.no_default else names @@ -2689,7 +2711,8 @@ def _refine_defaults_read( ) kwds["on_bad_lines"] = on_bad_lines else: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + raise ValueError( + f"Argument {on_bad_lines} is invalid for on_bad_lines") check_dtype_backend(dtype_backend) @@ -2711,7 +2734,8 @@ def _extract_dialect(kwds: dict[str, str | csv.Dialect]) -> csv.Dialect | None: dialect = kwds["dialect"] if isinstance(dialect, str) and 
dialect in csv.list_dialects(): - # get_dialect is typed to return a `_csv.Dialect` for some reason in typeshed + # get_dialect is typed to return a `_csv.Dialect` for some reason in + # typeshed tdialect = cast(csv.Dialect, csv.get_dialect(dialect)) _validate_dialect(tdialect) From 38fa19a18501125b3388361355d3fe624b7d7c3b Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Thu, 16 Oct 2025 01:47:35 +0800 Subject: [PATCH 05/24] Update readers.py --- pandas/io/parsers/readers.py | 56 +++++++++++------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a3f0849ece94d..38cb25285b438 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -112,8 +112,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): skipfooter: int nrows: int | None na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, - Iterable[Hashable]] | None + Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None ) keep_default_na: bool na_filter: bool @@ -578,10 +577,7 @@ class _Fwf_Defaults(TypedDict): widths: None -_fwf_defaults: _Fwf_Defaults = { - "colspecs": "infer", - "infer_nrows": 100, - "widths": None} +_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} _pyarrow_unsupported = { @@ -612,10 +608,7 @@ def validate_integer(name: str, val: float, min_val: int = ...) -> int: ... @overload -def validate_integer( - name: str, - val: int | None, - min_val: int = ...) -> int | None: ... +def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: ... def validate_integer( @@ -669,9 +662,7 @@ def _validate_names(names: Sequence[Hashable] | None) -> None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") if not ( - is_list_like( - names, allow_sets=False) or isinstance( - names, abc.KeysView) + is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) ): raise ValueError("Names should be an ordered collection.") @@ -790,8 +781,7 @@ def read_csv( nrows: int | None = None, # NA and Missing Data Handling na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, - Iterable[Hashable]] | None + Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None ) = None, keep_default_na: bool = True, na_filter: bool = True, @@ -1370,8 +1360,7 @@ def read_table( nrows: int | None = None, # NA and Missing Data Handling na_values: ( - Hashable | Iterable[Hashable] | Mapping[Hashable, - Iterable[Hashable]] | None + Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None ) = None, keep_default_na: bool = True, na_filter: bool = True, @@ -1985,8 +1974,7 @@ def read_fwf( if colspecs is None and widths is None: raise ValueError("Must specify either colspecs or widths") if colspecs not in (None, "infer") and widths is not None: - raise ValueError( - "You must specify only one of 'widths' and 'colspecs'") + raise ValueError("You must specify only one of 'widths' and 'colspecs'") # Compute 'colspecs' from 'widths', if specified. 
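The two guards above are easy to exercise from the public API; a minimal standalone sketch (the fixed-width input is illustrative):

    from io import StringIO

    import pandas as pd

    data = StringIO("12345abc\n67890def\n")
    # Supplying both widths and colspecs trips the second guard.
    try:
        pd.read_fwf(data, widths=[5, 3], colspecs=[(0, 5), (5, 8)])
    except ValueError as exc:
        print(exc)  # You must specify only one of 'widths' and 'colspecs'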
if widths is not None: @@ -2016,11 +2004,9 @@ def read_fwf( assert index_col is not lib.no_default len_index = len(index_col) - if kwds.get("usecols") is None and len( - names) + len_index != len(colspecs): + if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): # If usecols is used colspec may be longer than names - raise ValueError( - "Length of colspecs must match length of names") + raise ValueError("Length of colspecs must match length of names") check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) return _read( @@ -2112,8 +2098,7 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != getattr(value, "value", default) ): raise ValueError( - f"The { - argname!r} option is not supported with the 'pyarrow' engine" + f"The {argname!r} option is not supported with the 'pyarrow' engine" ) options[argname] = value @@ -2129,8 +2114,7 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: pass else: raise ValueError( - f"The { - argname!r} option is not supported with the " + f"The {argname!r} option is not supported with the " f"{engine!r} engine" ) else: @@ -2231,8 +2215,7 @@ def _clean_options( if "python" in engine: for arg in _python_unsupported: - if fallback_reason and result[arg] != _c_parser_defaults.get( - arg): + if fallback_reason and result[arg] != _c_parser_defaults.get(arg): raise ValueError( "Falling back to the 'python' engine because " f"{fallback_reason}, but this causes {arg!r} to be " @@ -2331,8 +2314,7 @@ def _make_engine( if engine not in mapping: raise ValueError( - f"Unknown engine: {engine} (valid options are { - mapping.keys()})" + f"Unknown engine: {engine} (valid options are {mapping.keys()})" ) if not isinstance(f, list): # open file here @@ -2436,8 +2418,7 @@ def read(self, nrows: int | None = None) -> DataFrame: if pandas_dtype(dtype[k]) in (np.str_, np.object_) else None ) - new_col_dict[k] = Series( - v, index=index, dtype=d, copy=False) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) else: new_col_dict = col_dict @@ -2528,8 +2509,7 @@ def TextParser(*args, **kwds) -> TextFileReader: return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True, - floatify: bool = True): +def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): na_fvalues: set | dict if na_values is None: if keep_default_na: @@ -2669,8 +2649,7 @@ def _refine_defaults_read( ) if delimiter and (sep is not lib.no_default): - raise ValueError( - "Specified a sep and a delimiter; you can only specify one.") + raise ValueError("Specified a sep and a delimiter; you can only specify one.") kwds["names"] = None if names is lib.no_default else names @@ -2711,8 +2690,7 @@ def _refine_defaults_read( ) kwds["on_bad_lines"] = on_bad_lines else: - raise ValueError( - f"Argument {on_bad_lines} is invalid for on_bad_lines") + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") check_dtype_backend(dtype_backend) From 204856d58c0c9c5c1c5fb218b4befaedd1187555 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Thu, 16 Oct 2025 02:37:23 +0800 Subject: [PATCH 06/24] Update readers.py --- pandas/io/parsers/readers.py | 58 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 38cb25285b438..2eefed861b00b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1196,55 +1196,55 @@ def read_csv( 
     Examples
     --------
-    >>> pd.read_csv("data.csv")  # doctest: +SKIP
-       Name  Value
-    0   foo      1
-    1   bar      2
-    2  #baz      3
+    >>> pd.read_csv("data.csv")  # doctest: +SKIP
+       Name  Value
+    0   foo      1
+    1   bar      2
+    2  #baz      3
 
     Index and header can be specified via the `index_col` and `header`
     arguments.
 
     >>> pd.read_csv("data.csv", header=None)  # doctest: +SKIP
-          0      1
-    0  Name  Value
-    1   foo      1
-    2   bar      2
-    3  #baz      3
+          0      1
+    0  Name  Value
+    1   foo      1
+    2   bar      2
+    3  #baz      3
 
     >>> pd.read_csv("data.csv", index_col="Value")  # doctest: +SKIP
-           Name
+           Name
     Value
-    1       foo
-    2       bar
-    3      #baz
+    1       foo
+    2       bar
+    3      #baz
 
     Column types are inferred but can be explicitly specified using the
-    dtype argument.
+    `dtype` argument.
 
     >>> pd.read_csv("data.csv", dtype={"Value": float})  # doctest: +SKIP
-       Name  Value
-    0   foo    1.0
-    1   bar    2.0
-    2  #baz    3.0
+       Name  Value
+    0   foo    1.0
+    1   bar    2.0
+    2  #baz    3.0
 
     True, False, and NA values, and thousands separators have defaults,
     but can be explicitly specified, too. Supply the values you would like
     as strings or lists of strings!
 
     >>> pd.read_csv("data.csv", na_values=["foo", "bar"])  # doctest: +SKIP
-       Name  Value
-    0   NaN      1
-    1   NaN      2
-    2  #baz      3
+       Name  Value
+    0   NaN      1
+    1   NaN      2
+    2  #baz      3
 
     Comment lines in the input file can be skipped using the `comment`
     argument.
 
     >>> pd.read_csv("data.csv", comment="#")  # doctest: +SKIP
-      Name  Value
-    0  foo      1
-    1  bar      2
+      Name  Value
+    0  foo      1
+    1  bar      2
 
     By default, columns with dates will be read as ``object`` rather than
     ``datetime``.
 
     >>> df = pd.read_csv("tmp.csv")  # doctest: +SKIP
 
     >>> df  # doctest: +SKIP
-       col 1      col 2            col 3
-    0     10  10/04/2018  Sun 15 Jan 2023
-    1     20  15/04/2018  Fri 12 May 2023
+       col 1      col 2            col 3
+    0     10  10/04/2018  Sun 15 Jan 2023
+    1     20  15/04/2018  Fri 12 May 2023
 
     >>> df.dtypes  # doctest: +SKIP
     col 1             int64
From c1b7dc34efc55020bfa88f9a69cf34a94eb51cc9 Mon Sep 17 00:00:00 2001
From: huhu-dsy
Date: Thu, 16 Oct 2025 14:03:58 +0800
Subject: [PATCH 07/24] Update readers.py

---
 pandas/io/parsers/readers.py | 67 ++++++++++++++----------------------
 1 file changed, 25 insertions(+), 42 deletions(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 2eefed861b00b..484ecd7a6be70 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -12,6 +12,7 @@
 )
 import csv
 import sys
+from textwrap import fill
 from typing import (
     IO,
     TYPE_CHECKING,
@@ -35,6 +36,7 @@
     ParserWarning,
 )
 from pandas.util._decorators import (
+    Appender,
     set_module,
 )
 from pandas.util._exceptions import find_stack_level
@@ -51,6 +53,7 @@
 from pandas import Series
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.api import RangeIndex
+from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import (
     IOHandles,
@@ -910,7 +913,7 @@ def read_csv(
     If ``converters`` are specified, they will be applied INSTEAD
     of ``dtype`` conversion.
 
-    .. versionadded:: 1.5.0
+    versionadded:: 1.5.0
 
     Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where
     the default determines the ``dtype`` of the columns which are not explicitly
@@ -922,7 +925,7 @@ def read_csv(
     Multithreading is currently only supported by
     the pyarrow engine.
 
-    .. versionadded:: 1.4.0
+    versionadded:: 1.4.0
 
     The 'pyarrow' engine was added as an *experimental* engine, and some features
     are unsupported, or may not work correctly, with this engine.
@@ -1022,7 +1025,7 @@ def read_csv(
     - "mixed", to infer the format for each element individually. This is risky,
     and you should probably use it along with `dayfirst`.
 
-    ..
versionadded:: 2.0.0 + versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True @@ -1056,10 +1059,10 @@ def read_csv( decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. - .. versionadded:: 1.5.0 + versionadded:: 1.5.0 Added support for `.tar` files. - .. versionchanged:: 1.4.0 Zstandard support. + versionchanged:: 1.4.0 Zstandard support. thousands : str (length 1), optional Character acting as the thousands separator in numerical values. decimal : str (length 1), default '.' @@ -1101,7 +1104,7 @@ def read_csv( How encoding errors are treated. `List of possible values `_. - .. versionadded:: 1.3.0 + versionadded:: 1.3.0 dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the following parameters: ``delimiter``, ``doublequote``, @@ -1132,12 +1135,12 @@ def read_csv( `_. - .. versionadded:: 1.3.0 + versionadded:: 1.3.0 - .. versionadded:: 1.4.0 + versionadded:: 1.4.0 Callable - .. versionchanged:: 2.2.0 + versionchanged:: 2.2.0 Callable for ``engine='pyarrow'`` low_memory : bool, default True Internally process the file in chunks, resulting in lower memory @@ -1179,7 +1182,7 @@ def read_csv( * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` - .. versionadded:: 2.0 + versionadded:: 2.0 Returns ------- @@ -1489,7 +1492,7 @@ def read_table( If ``converters`` are specified, they will be applied INSTEAD of ``dtype`` conversion. - .. versionadded:: 1.5.0 + versionadded:: 1.5.0 Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where the default determines the ``dtype`` of the columns which are not explicitly listed. @@ -1498,7 +1501,7 @@ def read_table( the python engine is currently more feature-complete. Multithreading is currently only supported by the pyarrow engine. - .. versionadded:: 1.4.0 + versionadded:: 1.4.0 The 'pyarrow' engine was added as an *experimental* engine, and some features are unsupported, or may not work correctly, with this engine. @@ -1602,7 +1605,7 @@ def read_table( - "mixed", to infer the format for each element individually. This is risky, and you should probably use it along with `dayfirst`. - .. versionadded:: 2.0.0 + versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True @@ -1635,10 +1638,10 @@ def read_table( decompression using a custom compression dictionary: ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. - .. versionadded:: 1.5.0 + versionadded:: 1.5.0 Added support for `.tar` files. - .. versionchanged:: 1.4.0 Zstandard support. + versionchanged:: 1.4.0 Zstandard support. thousands : str (length 1), optional Character acting as the thousands separator in numerical values. decimal : str (length 1), default '.' @@ -1679,7 +1682,7 @@ def read_table( How encoding errors are treated. `List of possible values `_. - .. versionadded:: 1.3.0 + versionadded:: 1.3.0 dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the following parameters: ``delimiter``, ``doublequote``, @@ -1708,12 +1711,12 @@ def read_table( `_. - .. versionadded:: 1.3.0 + versionadded:: 1.3.0 - .. versionadded:: 1.4.0 + versionadded:: 1.4.0 Callable - .. 
versionchanged:: 2.2.0 + versionchanged:: 2.2.0 Callable for ``engine='pyarrow'`` low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use @@ -1754,7 +1757,7 @@ def read_table( * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` - .. versionadded:: 2.0 + versionadded:: 2.0 Returns ------- @@ -1848,24 +1851,6 @@ def read_table( col 3 datetime64[ns] dtype: object """ - # locals() should never be modified - kwds = locals().copy() - del kwds["filepath_or_buffer"] - del kwds["sep"] - - kwds_defaults = _refine_defaults_read( - dialect, - delimiter, - engine, - sep, - on_bad_lines, - names, - defaults={"delimiter": "\t"}, - dtype_backend=dtype_backend, - ) - kwds.update(kwds_defaults) - - return _read(filepath_or_buffer, kwds) @overload @@ -2399,8 +2384,7 @@ def read(self, nrows: int | None = None) -> DataFrame: dtype_arg = None if isinstance(dtype_arg, dict): - # type: ignore[var-annotated] - dtype = defaultdict(lambda: None) + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] dtype.update(dtype_arg) elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( np.str_, @@ -2712,8 +2696,7 @@ def _extract_dialect(kwds: dict[str, str | csv.Dialect]) -> csv.Dialect | None: dialect = kwds["dialect"] if isinstance(dialect, str) and dialect in csv.list_dialects(): - # get_dialect is typed to return a `_csv.Dialect` for some reason in - # typeshed + # get_dialect is typed to return a `_csv.Dialect` for some reason in typeshed tdialect = cast(csv.Dialect, csv.get_dialect(dialect)) _validate_dialect(tdialect) From ce00de9eafd167d401a7fffb2ea97e57ade02e62 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Thu, 16 Oct 2025 14:39:12 +0800 Subject: [PATCH 08/24] Update readers.py --- pandas/io/parsers/readers.py | 1926 ++++++++++++++++++---------------- 1 file changed, 1017 insertions(+), 909 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 484ecd7a6be70..7d982d6798730 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -12,7 +12,6 @@ ) import csv import sys -from textwrap import fill from typing import ( IO, TYPE_CHECKING, @@ -36,7 +35,6 @@ ParserWarning, ) from pandas.util._decorators import ( - Appender, set_module, ) from pandas.util._exceptions import find_stack_level @@ -53,7 +51,6 @@ from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex -from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( IOHandles, @@ -820,466 +817,511 @@ def read_csv( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: r""" - Read a comma-separated values (csv) file into DataFrame. - - Also supports optionally iterating or breaking of the file - into chunks. - - Additional help can be found in the online docs for - `IO Tools `_. - - Parameters - ---------- - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. - sep : str, default ',' - Character or regex pattern to treat as the delimiter. 
If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. - delimiter : str, optional - Alias for ``sep``. - header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no - ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` - in the case of MultiIndex columns. - names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. - index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. - usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order - preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. 
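A runnable sketch of the reindex idiom just described (the column names follow the surrounding examples; the data is illustrative):

    from io import StringIO

    import pandas as pd

    data = StringIO("foo,bar,baz\n1,2,3\n")
    # usecols ignores element order; select, then reindex to fix the order.
    df = pd.read_csv(data, usecols=["foo", "bar"])[["bar", "foo"]]
    print(list(df.columns))  # ['bar', 'foo']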
- - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. - dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input - where - the default determines the ``dtype`` of the columns which are not explicitly - listed. - engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, - while the python engine - is currently more feature-complete. - Multithreading is currently only supported by - the pyarrow engine. - - versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, - and some features - are unsupported, or may not work correctly, with this engine. - converters : dict of {Hashable : Callable}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. - true_values : list, optional - Values to consider as ``True`` - in addition to case-insensitive variants of 'True'. - false_values : list, optional - Values to consider as ``False`` - in addition to case-insensitive variants of 'False'. - skipinitialspace : bool, default False - Skip spaces after delimiter. - skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row - should be skipped and ``False`` otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. - skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). - nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. - Refers to the number of data rows in the returned DataFrame, excluding: - - * The header row containing column names. - * Rows before the header row, if ``header=1`` or larger. - - Example usage: - - * To read the first 999,999 (non-header) rows: - ``read_csv(..., nrows=999999)`` - - * To read rows 1,000,000 through 1,999,999: - ``read_csv(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, - optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", - "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", - "None", "n/a", "nan", "null". - keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing the data. 
- Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, - ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, - the ``keep_default_na`` and - ``na_values`` parameters will be ignored. - na_filter : bool, default True - Detect missing value markers (empty strings and the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` can improve the - performance of reading a large file. - skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. - parse_dates : bool, None, list of Hashable, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing - columns 1, 2, 3 - each as a separate date column. - - If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_csv`. - - Note: A fast-path exists for iso8601-formatted dates. - date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times - when used in conjunction with ``parse_dates``. - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - versionadded:: 2.0.0 - dayfirst : bool, default False - DD/MM format dates, international and European format. - cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. - chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - compression : str or dict, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and - 'filepath_or_buffer' is - path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or ' - .tar.bz2' (otherwise no compression). 
- If using 'zip' or 'tar', the ZIP file must contain only one data - file to be read in. - Set to ``None`` for no decompression. - Can also be a dict with key ``'method'`` set to one of {``'zip'``, - ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, - ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, - ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. - As an example, the following could be passed for Zstandard - decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. - - versionadded:: 1.5.0 - Added support for `.tar` files. - - versionchanged:: 1.4.0 Zstandard support. - thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. - decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European - data). - lineterminator : str (length 1), optional - Character used to denote a line break. Only valid with C parser. - quotechar : str (length 1), optional - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. - quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or - csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default - csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. - Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only - fields containing special characters are quoted (e.g., characters - defined in ``quotechar``, ``delimiter``, or ``lineterminator``. - doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not - ``QUOTE_NONE``, indicate whether or not to interpret two - consecutive ``quotechar`` elements INSIDE a field as a single - ``quotechar`` element. - escapechar : str (length 1), optional - Character used to escape other characters. - comment : str (length 1), optional - Character indicating that the remainder of line should not be - parsed. If found at the beginning of a line, the line will be - ignored altogether. This parameter must be a single character. - Like empty lines (as long as ``skip_blank_lines=True``), fully - commented lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in - ``'a,b,c'`` being treated as the header. - encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). - `List of Python standard encodings - `_. - encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_. - - versionadded:: 1.3.0 - dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) - for the following parameters: ``delimiter``, ``doublequote``, - ``escapechar``, ``skipinitialspace``, ``quotechar``, and - ``quoting``. If it is necessary to override values, a - ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. - on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too - many fields). Allowed values are: - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and - skip that line. 
- - ``'skip'``, skip bad lines without raising or warning when they - are encountered. - - Callable, function that will process a single bad line. - - With ``engine='python'``, function with signature - ``(bad_line: list[str]) -> list[str] | None``. - ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be - ignored. - If the function returns a new ``list`` of strings with more - elements than expected, a ``ParserWarning`` will be emitted - while dropping extra elements. - - With ``engine='pyarrow'``, function with signature as - described in pyarrow documentation: `invalid_row_handler - `_. - - versionadded:: 1.3.0 - - versionadded:: 1.4.0 - Callable - - versionchanged:: 2.2.0 - Callable for ``engine='pyarrow'`` - low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory - use while parsing, but possibly mixed type inference. To ensure - no mixed types either set ``False``, or specify the type with the - ``dtype`` parameter. Note that the entire file is read into a - single :class:`~pandas.DataFrame` regardless, use the - ``chunksize`` or ``iterator`` parameter to return the data in - chunks. (Only valid with C parser). - memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file - object directly onto memory and access the data directly from - there. Using this option can improve performance because there is - no longer any I/O overhead. - float_precision : {'high', 'legacy', 'round_trip'}, optional - Specifies which converter the C engine should use for - floating-point values. The options are ``None`` or ``'high'`` for - the ordinary converter, ``'legacy'`` for the original lower - precision pandas converter, and ``'round_trip'`` for the - round-trip converter. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the - key-value pairs are forwarded to ``urllib.request.Request`` as - header options. For other URLs (e.g. starting with "s3://", and - "gcs://") the key-value pairs are forwarded to ``fsspec.open``. - Please see ``fsspec`` and ``urllib`` for more details, and for - more examples on storage options refer `here - `_. - dtype_backend : {'numpy_nullable', 'pyarrow'} - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior is - to not use nullable data types. If specified, the behavior is as - follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed - :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable - :class:`ArrowDtype` :class:`DataFrame` - - versionadded:: 2.0 - - Returns - ------- - DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - - See Also - -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) - file. - read_table : Read general delimited file into DataFrame. - read_fwf : Read a table of fixed-width formatted lines into DataFrame. - - Examples - -------- - >>> pd.read_csv("data.csv") # doctest: +SKIP - Name Value - 0 foo 1 - 1 bar 2 - 2 #baz 3 - - Index and header can be specified via the `index_col` and `header` - arguments.
- - >>> pd.read_csv("data.csv", header=None) # doctest: +SKIP - 0 1 - 0 Name Value - 1 foo 1 - 2 bar 2 - 3 #baz 3 - - >>> pd.read_csv("data.csv", index_col="Value") # doctest: +SKIP - Name - Value - 1 foo - 2 bar - 3 #baz - - Column types are inferred but can be explicitly specified using the - `dtype` argument. - - >>> pd.read_csv("data.csv", dtype={"Value": float}) # doctest: +SKIP - Name Value - 0 foo 1.0 - 1 bar 2.0 - 2 #baz 3.0 - - True, False, and NA values, and thousands separators have defaults, - but can be explicitly specified, too. Supply the values you would like - as strings or lists of strings! - - >>> pd.read_csv("data.csv", na_values=["foo", "bar"]) # doctest: +SKIP - Name Value - 0 NaN 1 - 1 NaN 2 - 2 #baz 3 - - Comment lines in the input file can be skipped using the `comment` - argument. - - >>> pd.read_csv("data.csv", comment="#") # doctest: +SKIP - Name Value - 0 foo 1 - 1 bar 2 - - By default, columns with dates will be read as ``object`` rather than - ``datetime``. - - >>> df = pd.read_csv("tmp.csv") # doctest: +SKIP - - >>> df # doctest: +SKIP - col 1 col 2 col 3 - 0 10 10/04/2018 Sun 15 Jan 2023 - 1 20 15/04/2018 Fri 12 May 2023 - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 object - col 3 object - dtype: object - - Specific columns can be parsed as dates by using the `parse_dates` and - `date_format` arguments. - - >>> df = pd.read_csv( - ... "tmp.csv", - ... parse_dates=[1, 2], - ... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, - ... ) # doctest: +SKIP - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 datetime64[ns] - col 3 datetime64[ns] - dtype: object - """ + Read a comma-separated values (csv) file into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the online docs for + `IO Tools `_. + + Parameters + ---------- + + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. + + sep : str, default ',' + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + In addition, separators longer than 1 character and different from + ``'\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. Regex example: ``'\r\t'``. + + delimiter : str, optional + Alias for ``sep``. + + header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: if no + ``names`` + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly to ``names`` then the behavior is identical to + ``header=None``. 
Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a :class:`~pandas.MultiIndex` on the columns + e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. + + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the form + ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. + Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` + in the case of MultiIndex columns. + + names : Sequence of Hashable, optional + Sequence of column labels to apply. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + + index_col : Hashable, Sequence of Hashable or False, optional + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. + + usecols : Sequence of Hashable or Callable, optional + Subset of columns to select, denoted either by column labels or column indices. + If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in ``names`` or + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order + preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + for columns in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to ``True``. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. + + dtype : dtype or dict of {Hashable : dtype}, optional + Data type(s) to apply to either the whole dataset or individual columns. + E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` + Use ``str`` or ``object`` together with suitable ``na_values`` settings + to preserve and not interpret ``dtype``. + If ``converters`` are specified, they will be applied INSTEAD + of ``dtype`` conversion. + + .. versionadded:: 1.5.0 + Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input + where + the default determines the ``dtype`` of the columns which are not explicitly + listed. + + engine : {'c', 'python', 'pyarrow'}, optional + Parser engine to use.
The C and pyarrow engines are faster, + while the python engine + is currently more feature-complete. + Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + The 'pyarrow' engine was added as an *experimental* engine, + and some features + are unsupported, or may not work correctly, with this engine. + + converters : dict of {Hashable : Callable}, optional + Functions for converting values in specified columns. Keys can either + be column labels or column indices. + + true_values : list, optional + Values to consider as ``True`` + in addition to case-insensitive variants of 'True'. + + false_values : list, optional + Values to consider as ``False`` + in addition to case-insensitive variants of 'False'. + + skipinitialspace : bool, default False + Skip spaces after delimiter. + + skiprows : int, list of int or Callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (``int``) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning ``True`` if the row + should be skipped and ``False`` otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. + + skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). + + nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. + Refers to the number of data rows in the returned DataFrame, excluding: + + * The header row containing column names. + * Rows before the header row, if ``header=1`` or larger. + + Example usage: + + * To read the first 999,999 (non-header) rows: + ``read_csv(..., nrows=999999)`` + + * To read rows 1,000,000 through 1,999,999: + ``read_csv(..., skiprows=1000000, nrows=999999)`` + + na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, + optional + Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific + per-column ``NA`` values. By default the following values are interpreted as + ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", + "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", + "None", "n/a", "nan", "null". + + keep_default_na : bool, default True + Whether or not to include the default ``NaN`` values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, + ``na_values`` + is appended to the default ``NaN`` values used for parsing. + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only + the default ``NaN`` values are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the ``NaN`` values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no + strings will be parsed as ``NaN``. + + Note that if ``na_filter`` is passed in as ``False``, + the ``keep_default_na`` and + ``na_values`` parameters will be ignored. + + na_filter : bool, default True + Detect missing value markers (empty strings and the value of ``na_values``). In + data without any ``NA`` values, passing ``na_filter=False`` can improve the + performance of reading a large file. + + skip_blank_lines : bool, default True + If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
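The interplay of ``na_values``, ``keep_default_na`` and ``na_filter`` documented above is easiest to check against a small in-memory buffer. A minimal sketch (the ``data`` string is invented for illustration):

    from io import StringIO

    import pandas as pd

    # "n/a" is one of the default NA markers; "missing" is not.
    data = "a,b\nn/a,missing\n1,2"

    pd.read_csv(StringIO(data))                         # only "n/a" -> NaN
    pd.read_csv(StringIO(data), na_values=["missing"])  # both -> NaN
    pd.read_csv(
        StringIO(data), na_values=["missing"], keep_default_na=False
    )                                                   # only "missing" -> NaN
    pd.read_csv(StringIO(data), na_filter=False)        # nothing -> NaN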
+ + parse_dates : bool, None, list of Hashable, default None + The behavior is as follows: + + * ``bool``. If ``True`` -> try parsing the index. + * ``None``. Behaves like ``True`` if ``date_format`` is specified. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing + columns 1, 2, 3 + each as a separate date column. + + If a column or index cannot be represented as an array of ``datetime``, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an ``object`` data type. For + non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after + :func:`~pandas.read_csv`. + + Note: A fast-path exists for iso8601-formatted dates. + + date_format : str or dict of column -> format, optional + Format to use for parsing dates and/or times + when used in conjunction with ``parse_dates``. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + .. versionadded:: 2.0.0 + + dayfirst : bool, default False + DD/MM format dates, international and European format. + + cache_dates : bool, default True + If ``True``, use a cache of unique, converted dates to apply the ``datetime`` + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + iterator : bool, default False + Return ``TextFileReader`` object for iteration or getting chunks with + ``get_chunk()``. + + chunksize : int, optional + Number of lines to read from the file per chunk. Passing a value will cause the + function to return a ``TextFileReader`` object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + + compression : str or dict, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + 'filepath_or_buffer' is + path-like, then detect compression from the following extensions: '.gz', + '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or + '.tar.bz2' (otherwise no compression). + If using 'zip' or 'tar', the ZIP file must contain only one data + file to be read in. + Set to ``None`` for no decompression. + Can also be a dict with key ``'method'`` set to one of {``'zip'``, + ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other + key-value pairs are forwarded to ``zipfile.ZipFile``, + ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, + ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. + As an example, the following could be passed for Zstandard + decompression using a custom compression dictionary: + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. + + .. versionadded:: 1.5.0 + Added support for `.tar` files. + + .. versionchanged:: 1.4.0 Zstandard support. + + thousands : str (length 1), optional + Character acting as the thousands separator in numerical values. + + decimal : str (length 1), default '.' + Character to recognize as decimal point (e.g., use ',' for European + data). + + lineterminator : str (length 1), optional + Character used to denote a line break. Only valid with C parser.
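As a concrete illustration of the ``thousands`` and ``decimal`` options above, European-formatted numbers parse to ``float64`` once both separators are declared; a small sketch with invented data:

    from io import StringIO

    import pandas as pd

    # '.' as thousands separator, ',' as decimal point.
    data = "price\n1.234,56\n7.890,12"
    df = pd.read_csv(StringIO(data), thousands=".", decimal=",")
    # df["price"] is float64: [1234.56, 7890.12]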
+ + quotechar : str (length 1), optional + Character used to denote the start and end of a quoted item. Quoted + items can include the ``delimiter`` and it will be ignored. + + quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or + csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default + csv.QUOTE_MINIMAL + Control field quoting behavior per ``csv.QUOTE_*`` constants. + Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only + fields containing special characters are quoted (e.g., characters + defined in ``quotechar``, ``delimiter``, or ``lineterminator``). + + doublequote : bool, default True + When ``quotechar`` is specified and ``quoting`` is not + ``QUOTE_NONE``, indicate whether or not to interpret two + consecutive ``quotechar`` elements INSIDE a field as a single + ``quotechar`` element. + + escapechar : str (length 1), optional + Character used to escape other characters. + + comment : str (length 1), optional + Character indicating that the remainder of line should not be + parsed. If found at the beginning of a line, the line will be + ignored altogether. This parameter must be a single character. + Like empty lines (as long as ``skip_blank_lines=True``), fully + commented lines are ignored by the parameter ``header`` but not by + ``skiprows``. For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in + ``'a,b,c'`` being treated as the header. + + encoding : str, optional, default 'utf-8' + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). + `List of Python standard encodings + `_. + + encoding_errors : str, optional, default 'strict' + How encoding errors are treated. `List of possible values + `_. + + .. versionadded:: 1.3.0 + + dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) + for the following parameters: ``delimiter``, ``doublequote``, + ``escapechar``, ``skipinitialspace``, ``quotechar``, and + ``quoting``. If it is necessary to override values, a + ``ParserWarning`` will be issued. See ``csv.Dialect`` + documentation for more details. + + on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' + Specifies what to do upon encountering a bad line (a line with too + many fields). Allowed values are: + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'warn'``, raise a warning when a bad line is encountered and + skip that line. + - ``'skip'``, skip bad lines without raising or warning when they + are encountered. + - Callable, function that will process a single bad line. + - With ``engine='python'``, function with signature + ``(bad_line: list[str]) -> list[str] | None``. + ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be + ignored. + If the function returns a new ``list`` of strings with more + elements than expected, a ``ParserWarning`` will be emitted + while dropping extra elements. + - With ``engine='pyarrow'``, function with signature as + described in pyarrow documentation: `invalid_row_handler + `_. + + .. versionadded:: 1.3.0 + + .. versionadded:: 1.4.0 + Callable + + .. versionchanged:: 2.2.0 + Callable for ``engine='pyarrow'`` + + low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory + use while parsing, but possibly mixed type inference. To ensure + no mixed types either set ``False``, or specify the type with the + ``dtype`` parameter.
Note that the entire file is read into a + single :class:`~pandas.DataFrame` regardless, use the + ``chunksize`` or ``iterator`` parameter to return the data in + chunks. (Only valid with C parser). + + memory_map : bool, default False + If a filepath is provided for ``filepath_or_buffer``, map the file + object directly onto memory and access the data directly from + there. Using this option can improve performance because there is + no longer any I/O overhead. + + float_precision : {'high', 'legacy', 'round_trip'}, optional + Specifies which converter the C engine should use for + floating-point values. The options are ``None`` or ``'high'`` for + the ordinary converter, ``'legacy'`` for the original lower + precision pandas converter, and ``'round_trip'`` for the + round-trip converter. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. For HTTP(S) URLs the + key-value pairs are forwarded to ``urllib.request.Request`` as + header options. For other URLs (e.g. starting with "s3://", and + "gcs://") the key-value pairs are forwarded to ``fsspec.open``. + Please see ``fsspec`` and ``urllib`` for more details, and for + more examples on storage options refer `here + `_. + + dtype_backend : {'numpy_nullable', 'pyarrow'} + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). If not specified, the default behavior is + to not use nullable data types. If specified, the behavior is as + follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed + :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` + + .. versionadded:: 2.0 + + Returns + ------- + + DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) + file. + read_table : Read general delimited file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Examples + -------- + >>> pd.read_csv("data.csv") # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + 2 #baz 3 + + Index and header can be specified via the `index_col` and `header` + arguments. + + >>> pd.read_csv("data.csv", header=None) # doctest: +SKIP + 0 1 + 0 Name Value + 1 foo 1 + 2 bar 2 + 3 #baz 3 + + >>> pd.read_csv("data.csv", index_col="Value") # doctest: +SKIP + Name + Value + 1 foo + 2 bar + 3 #baz + + Column types are inferred but can be explicitly specified using the + `dtype` argument. + + >>> pd.read_csv("data.csv", dtype={"Value": float}) # doctest: +SKIP + Name Value + 0 foo 1.0 + 1 bar 2.0 + 2 #baz 3.0 + + True, False, and NA values, and thousands separators have defaults, + but can be explicitly specified, too. Supply the values you would like + as strings or lists of strings! + + >>> pd.read_csv("data.csv", na_values=["foo", "bar"]) # doctest: +SKIP + Name Value + 0 NaN 1 + 1 NaN 2 + 2 #baz 3 + + Comment lines in the input file can be skipped using the `comment` + argument. + + >>> pd.read_csv("data.csv", comment="#") # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + + By default, columns with dates will be read as ``object`` rather than + ``datetime``.
+ + >>> df = pd.read_csv("tmp.csv") # doctest: +SKIP + + >>> df # doctest: +SKIP + col 1 col 2 col 3 + 0 10 10/04/2018 Sun 15 Jan 2023 + 1 20 15/04/2018 Fri 12 May 2023 + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 object + col 3 object + dtype: object + + Specific columns can be parsed as dates by using the `parse_dates` and + `date_format` arguments. + + >>> df = pd.read_csv( + ... "tmp.csv", + ... parse_dates=[1, 2], + ... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, + ... ) # doctest: +SKIP + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 datetime64[ns] + col 3 datetime64[ns] + dtype: object + """ # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] del kwds["sep"] @@ -1399,458 +1441,524 @@ def read_table( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: """ - Read general delimited file into DataFrame. + Read general delimited file into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the online docs for + `IO Tools `_. + + Parameters + ---------- + + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. + + sep : str, default '\\t' (tab-stop) + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + + In addition, separators longer than 1 character and different from + ``'\\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. Regex example: ``'\r\t'``. + + delimiter : str, optional + Alias for ``sep``. + + header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: + if no ``names`` are passed the behavior is identical to ``header=0`` + and column names are inferred from the first line of the file, if + column names are passed explicitly to ``names`` then the behavior is + identical to ``header=None``. Explicitly pass ``header=0`` to be + able to replace existing names. The header can be a list of integers + that specify row locations for a :class:`~pandas.MultiIndex` on the + columns e.g. ``[0, 1, 3]``. Intervening rows that are not specified + will be skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. + + When inferred from the file contents, headers are kept distinct from + each other by renaming duplicate names with a numeric suffix of the + form ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``.
+ Empty headers are named ``"Unnamed: {i}"`` or + ``"Unnamed: {i}_level_{level}"`` in the case of MultiIndex columns. + + names : Sequence of Hashable, optional + Sequence of column labels to apply. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column + names. Duplicates in this list are not allowed. + + index_col : Hashable, Sequence of Hashable or False, optional + Column(s) to use as row label(s), denoted either by column labels or + column indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use + the first column as the index, e.g., when you have a malformed file + with delimiters at the end of each line. + + usecols : Sequence of Hashable or Callable, optional + Subset of columns to select, denoted either by column labels or + column indices. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or + strings that correspond to column names provided either by the user + in ``names`` or inferred from the document header row(s). If + ``names`` are given, the document header row(s) are not taken into + account. For example, a valid list-like ``usecols`` parameter would + be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is + ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To + instantiate a :class:`~pandas.DataFrame` from ``data`` with element + order preserved use + ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + for columns in ``['foo', 'bar']`` order or + ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the + column names, returning names where the callable function evaluates + to ``True``. An example of a valid callable argument would be + ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this + parameter results in much faster parsing time and lower memory usage. + + dtype : dtype or dict of {Hashable : dtype}, optional + Data type(s) to apply to either the whole dataset or individual + columns. E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` + Use ``str`` or ``object`` together with suitable ``na_values`` + settings to preserve and not interpret ``dtype``. + If ``converters`` are specified, they will be applied INSTEAD + of ``dtype`` conversion. + + .. versionadded:: 1.5.0 + Support for ``defaultdict`` was added. Specify a ``defaultdict`` + as input where the default determines the ``dtype`` of the + columns which are not explicitly listed. + + engine : {'c', 'python', 'pyarrow'}, optional + Parser engine to use. The C and pyarrow engines are faster, while + the python engine is currently more feature-complete. Multithreading + is currently only supported by the pyarrow engine. + + .. versionadded:: 1.4.0 + The 'pyarrow' engine was added as an *experimental* engine, and + some features are unsupported, or may not work correctly, with + this engine. + + converters : dict of {Hashable : Callable}, optional + Functions for converting values in specified columns. Keys can either + be column labels or column indices. + + true_values : list, optional + Values to consider as ``True`` in addition to case-insensitive + variants of 'True'. + + false_values : list, optional + Values to consider as ``False`` in addition to case-insensitive + variants of 'False'.
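A short sketch of how ``converters`` and ``true_values``/``false_values`` combine in ``read_table`` (column names and data invented; note that a column is only cast to ``bool`` when every value matches one of the two lists):

    from io import StringIO

    import pandas as pd

    data = "flag\tamount\nyes\t1,000\nno\t2,500"
    df = pd.read_table(
        StringIO(data),
        true_values=["yes"],
        false_values=["no"],
        # A converter replaces dtype conversion for this column.
        converters={"amount": lambda s: int(s.replace(",", ""))},
    )
    # df["flag"] -> [True, False]; df["amount"] -> [1000, 2500]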
+ + skipinitialspace : bool, default False + Skip spaces after delimiter. + + skiprows : int, list of int or Callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (``int``) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning ``True`` if the row should be skipped and ``False`` + otherwise. An example of a valid callable argument would be + ``lambda x: x in [0, 2]``. + + skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with + ``engine='c'``). + + nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large + files. Refers to the number of data rows in the returned DataFrame, + excluding: - Also supports optionally iterating or breaking of the file - into chunks. + * The header row containing column names. + + * Rows before the header row, if ``header=1`` or larger. - Additional help can be found in the online docs for - `IO Tools `_. - Parameters - ---------- - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. - sep : str, default '\\t' (tab-stop) - Character or regex pattern to treat as the delimiter. If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. - delimiter : str, optional - Alias for ``sep``. - header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: - if no ``names`` are passed the behavior is identical to ``header=0`` - and column names are inferred from the first line of the file, if - column names are passed explicitly to ``names`` then the behavior is - identical to ``header=None``. Explicitly pass ``header=0`` to be - able to replace existing names. The header can be a list of integers - that specify row locations for a :class:`~pandas.MultiIndex` on the - columns e.g. ``[0, 1, 3]``. Intervening rows that are not specified - will be skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
- names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column - names. Duplicates in this list are not allowed. - index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or - column indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use - the first column as the index, e.g., when you have a malformed file - with delimiters at the end of each line. - usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or - column indices. If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or - strings that correspond to column names provided either by the user - in ``names`` or inferred from the document header row(s). If - ``names`` are given, the document header row(s) are not taken into - account. For example, a valid list-like ``usecols`` parameter would - be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is - ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To - instantiate a :class:`~pandas.DataFrame` from ``data`` with element - order preserved use - ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the - column names, returning names where the callable function evaluates - to ``True``. An example of a valid callable argument would be - ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this - parameter results in much faster parsing time and lower memory usage. - dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual - columns. E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` - settings to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` - as input where the default determines the ``dtype`` of the - columns which are not explicitly listed. - engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, while - the python engine is currently more feature-complete. Multithreading - is currently only supported by the pyarrow engine. - - versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and - some features are unsupported, or may not work correctly, with - this engine. - converters : dict of {Hashable : Callable}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. - true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive - variants of 'True'. - false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive - variants of 'False'. - skipinitialspace : bool, default False - Skip spaces after delimiter. 
- skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` - otherwise. An example of a valid callable argument would be - ``lambda x: x in [0, 2]``. - skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with - ``engine='c'``). - nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large - files. Refers to the number of data rows in the returned DataFrame, - excluding: - - * The header row containing column names. - * Rows before the header row, if ``header=1`` or larger. - - Example usage: - - * To read the first 999,999 (non-header) rows: - ``read_table(..., nrows=999999)`` - - * To read rows 1,000,000 through 1,999,999: - ``read_table(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or dict of {Hashable : - Iterable}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` - passed, specific per-column ``NA`` values. By default the following - values are interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA", - "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "", - "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null". - keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing - the data. Depending on whether ``na_values`` is passed in, the - behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are - specified, ``na_values`` is appended to the default ``NaN`` values - used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not - specified, only the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are - specified, only the ``NaN`` values specified ``na_values`` are - used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not - specified, no strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, the - ``keep_default_na`` and ``na_values`` parameters will be ignored. - na_filter : bool, default True - Detect missing value markers (empty strings and the value of - ``na_values``). In data without any ``NA`` values, passing - ``na_filter=False`` can improve the performance of reading a large - file. - skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as - ``NaN`` values. - parse_dates : bool, None, list of Hashable, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try - parsing columns 1, 2, 3 each as a separate date column. - - If a column or index cannot be represented as an array of - ``datetime``, say because of an unparsable value or a mixture of - timezones, the column or index will be returned unaltered as an - ``object`` data type. For non-standard ``datetime`` parsing, use - :func:`~pandas.to_datetime` after :func:`~pandas.read_table`. - - Note: A fast-path exists for iso8601-formatted dates. - date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times when used in conjunction - with ``parse_dates``. The strftime to parse time, e.g. 
- :const:`"%d/%m/%Y"`. See `strftime documentation - `_ for more information on choices, - though note that :const:`"%f"`` will parse all the way up to - nanoseconds. You can also pass: - - - "ISO8601", to parse any `ISO8601 - `_ time string (not - necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This - is risky, and you should probably use it along with `dayfirst`. - - versionadded:: 2.0.0 - dayfirst : bool, default False - DD/MM format dates, international and European format. - cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the - ``datetime`` conversion. May produce significant speed-up when - parsing duplicate date strings, especially ones with timezone - offsets. - iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks - with ``get_chunk()``. - chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value - will cause the function to return a ``TextFileReader`` object for - iteration. See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - compression : str or dict, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and - 'filepath_or_buffer' is path-like, then detect compression from the - following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', - '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression). - If using 'zip' or 'tar', the ZIP file must contain only one data - file to be read in. Set to ``None`` for no decompression. - Can also be a dict with key ``'method'`` set to one of - {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} - and other key-value pairs are forwarded to ``zipfile.ZipFile``, - ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, - ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. - As an example, the following could be passed for Zstandard - decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. - - versionadded:: 1.5.0 - Added support for `.tar` files. - - versionchanged:: 1.4.0 Zstandard support. - thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. - decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European - data). - lineterminator : str (length 1), optional - Character used to denote a line break. Only valid with C parser. - quotechar : str (length 1), default '"' - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. - quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, - 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default - csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. - Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only - fields containing special characters are quoted (e.g., characters - defined in ``quotechar``, ``delimiter``, or ``lineterminator``. - doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, - indicate whether or not to interpret two consecutive ``quotechar`` - elements INSIDE a field as a single ``quotechar`` element. - escapechar : str (length 1), optional - Character used to escape other characters. 
- comment : str (length 1), optional - Character indicating that the remainder of line should not be - parsed. If found at the beginning of a line, the line will be - ignored altogether. This parameter must be a single character. Like - empty lines (as long as ``skip_blank_lines=True``), fully commented - lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in - ``'a,b,c'`` being treated as the header. - encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). - `List of Python standard encodings - `_. - encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_. - - versionadded:: 1.3.0 - dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) - for the following parameters: ``delimiter``, ``doublequote``, - ``escapechar``, ``skipinitialspace``, ``quotechar``, and ``quoting``. - If it is necessary to override values, a ``ParserWarning`` will be - issued. See ``csv.Dialect`` documentation for more details. - on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too - many fields). Allowed values are: - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and - skip that line. - - ``'skip'``, skip bad lines without raising or warning when they - are encountered. - - Callable, function that will process a single bad line. - - With ``engine='python'``, function with signature - ``(bad_line: list[str]) -> list[str] | None``. - ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more - elements than expected, a ``ParserWarning`` will be emitted - while dropping extra elements. - - With ``engine='pyarrow'``, function with signature - as described in pyarrow documentation: `invalid_row_handler - `_. - - versionadded:: 1.3.0 - - versionadded:: 1.4.0 - Callable - - versionchanged:: 2.2.0 - Callable for ``engine='pyarrow'`` - low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory use - while parsing, but possibly mixed type inference. To ensure no - mixed types either set ``False``, or specify the type with the - ``dtype`` parameter. Note that the entire file is read into a single - :class:`~pandas.DataFrame` regardless, use the ``chunksize`` or - ``iterator`` parameter to return the data in chunks. (Only valid - with C parser). - memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file - object directly onto memory and access the data directly from there. - Using this option can improve performance because there is no longer - any I/O overhead. - float_precision : {'high', 'legacy', 'round_trip'}, optional - Specifies which converter the C engine should use for floating-point - values. The options are ``None`` or ``'high'`` for the ordinary - converter, ``'legacy'`` for the original lower precision pandas - converter, and ``'round_trip'`` for the round-trip converter. - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. 
For HTTP(S) URLs the
-        key-value pairs are forwarded to ``urllib.request.Request`` as
-        header options. For other URLs (e.g. starting with "s3://", and
-        "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
-        Please see ``fsspec`` and ``urllib`` for more details, and for more
-        examples on storage options refer `here
-        <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_.
-    dtype_backend : {'numpy_nullable', 'pyarrow'}
-        Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). If not specified, the default behavior
-        is to not use nullable data types. If specified, the behavior
-        is as follows:
-
-        * ``"numpy_nullable"``: returns nullable-dtype-backed
-          :class:`DataFrame`
-        * ``"pyarrow"``: returns pyarrow-backed nullable
-          :class:`ArrowDtype` :class:`DataFrame`
-
-        .. versionadded:: 2.0
+        Example usage:
-    Returns
-    -------
-    DataFrame or TextFileReader
-        A general delimited file is returned as two-dimensional
-        data structure with labeled axes.
+
+        * To read the first 999,999 (non-header) rows:
-    See Also
-    --------
-    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv)
-        file.
-    read_csv : Read a comma-separated values (csv) file into DataFrame.
-    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+          ``read_table(..., nrows=999999)``
-    Examples
-    --------
-    >>> pd.read_table("data.txt")  # doctest: +SKIP
-       Name  Value
-    0   foo      1
-    1   bar      2
-    2  #baz      3
-
-    Index and header can be specified via the `index_col` and `header` arguments.
-
-    >>> pd.read_table("data.txt", header=None)  # doctest: +SKIP
-          0      1
-    0  Name  Value
-    1   foo      1
-    2   bar      2
-    3  #baz      3
-
-    >>> pd.read_table("data.txt", index_col="Value")  # doctest: +SKIP
-           Name
-    Value
-    1       foo
-    2       bar
-    3      #baz
-
-    Column types are inferred but can be explicitly specified using the dtype argument.
-
-    >>> pd.read_table("data.txt", dtype={"Value": float})  # doctest: +SKIP
-       Name  Value
-    0   foo    1.0
-    1   bar    2.0
-    2  #baz    3.0
-
-    True, False, and NA values, and thousands separators have defaults,
-    but can be explicitly specified, too. Supply the values you would like
-    as strings or lists of strings!
-
-    >>> pd.read_table("data.txt", na_values=["foo", "bar"])  # doctest: +SKIP
-       Name  Value
-    0   NaN      1
-    1   NaN      2
-    2  #baz      3
-
-    Comment lines in the input file can be skipped using the `comment` argument.
-
-    >>> pd.read_table("data.txt", comment="#")  # doctest: +SKIP
-      Name  Value
-    0  foo      1
-    1  bar      2
-
-    By default, columns with dates will be read as ``object`` rather than ``datetime``.
-
-    >>> df = pd.read_table("tmp.txt")  # doctest: +SKIP
-
-    >>> df  # doctest: +SKIP
-       col 1       col 2            col 3
-    0     10  10/04/2018  Sun 15 Jan 2023
-    1     20  15/04/2018  Fri 12 May 2023
-
-    >>> df.dtypes  # doctest: +SKIP
-    col 1     int64
-    col 2    object
-    col 3    object
-    dtype: object
-
-    Specific columns can be parsed as dates by using the `parse_dates` and
-    `date_format` arguments.
-
-    >>> df = pd.read_table(
-    ...     "tmp.txt",
-    ...     parse_dates=[1, 2],
-    ...     date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
-    ... )  # doctest: +SKIP
-
-    >>> df.dtypes  # doctest: +SKIP
-    col 1             int64
-    col 2    datetime64[ns]
-    col 3    datetime64[ns]
-    dtype: object
-    """
+
+        * To read rows 1,000,000 through 1,999,999:
+          ``read_table(..., skiprows=1000000, nrows=999999)``
+
+    na_values : Hashable, Iterable of Hashable or dict of {Hashable :
+        Iterable}, optional
+        Additional strings to recognize as ``NA``/``NaN``. If ``dict``
+        passed, specific per-column ``NA`` values.
+        By default the following values are interpreted as ``NaN``: "",
+        "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
+        "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None",
+        "n/a", "nan", "null".
+
+    keep_default_na : bool, default True
+        Whether or not to include the default ``NaN`` values when parsing
+        the data. Depending on whether ``na_values`` is passed in, the
+        behavior is as follows:
+
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are
+          specified, ``na_values`` is appended to the default ``NaN`` values
+          used for parsing.
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are not
+          specified, only the default ``NaN`` values are used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are
+          specified, only the ``NaN`` values specified in ``na_values`` are
+          used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are not
+          specified, no strings will be parsed as ``NaN``.
+
+        Note that if ``na_filter`` is passed in as ``False``, the
+        ``keep_default_na`` and ``na_values`` parameters will be ignored.
+
+    na_filter : bool, default True
+        Detect missing value markers (empty strings and the value of
+        ``na_values``). In data without any ``NA`` values, passing
+        ``na_filter=False`` can improve the performance of reading a large
+        file.
+
+    skip_blank_lines : bool, default True
+        If ``True``, skip over blank lines rather than interpreting as
+        ``NaN`` values.
+
+    parse_dates : bool, None, list of Hashable, default None
+        The behavior is as follows:
+
+        * ``bool``. If ``True`` -> try parsing the index.
+        * ``None``. Behaves like ``True`` if ``date_format`` is specified.
+        * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try
+          parsing columns 1, 2, 3 each as a separate date column.
+
+        If a column or index cannot be represented as an array of
+        ``datetime``, say because of an unparsable value or a mixture of
+        timezones, the column or index will be returned unaltered as an
+        ``object`` data type. For non-standard ``datetime`` parsing, use
+        :func:`~pandas.to_datetime` after :func:`~pandas.read_table`.
+
+        Note: A fast-path exists for iso8601-formatted dates.
+
+    date_format : str or dict of column -> format, optional
+        Format to use for parsing dates and/or times when used in
+        conjunction with ``parse_dates``. The strftime to parse time, e.g.
+        :const:`"%d/%m/%Y"`. See `strftime documentation
+        <https://docs.python.org/3/library/datetime.html
+        #strftime-and-strptime-behavior>`_ for more information on choices,
+        though note that :const:`"%f"` will parse all the way up to
+        nanoseconds. You can also pass:
+
+        * "ISO8601", to parse any `ISO8601
+          <https://en.wikipedia.org/wiki/ISO_8601>`_ time string (not
+          necessarily in exactly the same format);
+        * "mixed", to infer the format for each element individually. This
+          is risky, and you should probably use it along with `dayfirst`.
+
+        .. versionadded:: 2.0.0
+
+    dayfirst : bool, default False
+        DD/MM format dates, international and European format.
+
+    cache_dates : bool, default True
+        If ``True``, use a cache of unique, converted dates to apply the
+        ``datetime`` conversion. May produce significant speed-up when
+        parsing duplicate date strings, especially ones with timezone
+        offsets.
+
+    iterator : bool, default False
+        Return ``TextFileReader`` object for iteration or getting chunks
+        with ``get_chunk()``.
+
+    chunksize : int, optional
+        Number of lines to read from the file per chunk. Passing a value
+        will cause the function to return a ``TextFileReader`` object for
+        iteration. See the `IO Tools docs
+        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
+        for more information on ``iterator`` and ``chunksize``.
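An editor's aside (not part of the patch): a minimal sketch of the ``iterator``/``chunksize`` behaviour documented above, using invented inline data.

    >>> import io
    >>> import pandas as pd
    >>> data = "a,b\n" + "\n".join(f"{i},{i * 2}" for i in range(10))
    >>> # chunksize=4 yields a TextFileReader instead of a DataFrame
    >>> with pd.read_csv(io.StringIO(data), chunksize=4) as reader:  # doctest: +SKIP
    ...     for chunk in reader:
    ...         print(chunk.shape)
    (4, 2)
    (4, 2)
    (2, 2)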
+
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data
+        file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to ``zipfile.ZipFile``,
+        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
+        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard
+        decompression using a custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+
+        .. versionchanged:: 1.4.0 Zstandard support.
+
+    thousands : str (length 1), optional
+        Character acting as the thousands separator in numerical values.
+
+    decimal : str (length 1), default '.'
+        Character to recognize as decimal point (e.g., use ',' for European
+        data).
+
+    lineterminator : str (length 1), optional
+        Character used to denote a line break. Only valid with C parser.
+
+    quotechar : str (length 1), default '"'
+        Character used to denote the start and end of a quoted item. Quoted
+        items can include the ``delimiter`` and it will be ignored.
+
+    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL,
+        2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default
+        csv.QUOTE_MINIMAL
+        Control field quoting behavior per ``csv.QUOTE_*`` constants.
+        Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only
+        fields containing special characters are quoted (e.g., characters
+        defined in ``quotechar``, ``delimiter``, or ``lineterminator``).
+
+    doublequote : bool, default True
+        When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
+        indicate whether or not to interpret two consecutive ``quotechar``
+        elements INSIDE a field as a single ``quotechar`` element.
+
+    escapechar : str (length 1), optional
+        Character used to escape other characters.
+
+    comment : str (length 1), optional
+        Character indicating that the remainder of line should not be
+        parsed. If found at the beginning of a line, the line will be
+        ignored altogether. This parameter must be a single character. Like
+        empty lines (as long as ``skip_blank_lines=True``), fully commented
+        lines are ignored by the parameter ``header`` but not by
+        ``skiprows``. For example, if ``comment='#'``, parsing
+        ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in
+        ``'a,b,c'`` being treated as the header.
+
+    encoding : str, optional, default 'utf-8'
+        Encoding to use for UTF when reading/writing (ex. ``'utf-8'``).
+        `List of Python standard encodings
+        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
+
+    encoding_errors : str, optional, default 'strict'
+        How encoding errors are treated. `List of possible values
+        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
+
+        .. versionadded:: 1.3.0
+
+    dialect : str or csv.Dialect, optional
+        If provided, this parameter will override values (default or not)
+        for the following parameters: ``delimiter``, ``doublequote``,
+        ``escapechar``, ``skipinitialspace``, ``quotechar``, and ``quoting``.
+        If it is necessary to override values, a ``ParserWarning`` will be
+        issued. See ``csv.Dialect`` documentation for more details.
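Illustrative aside (not part of the patch; the data is invented): the ``thousands``/``decimal`` pair documented above is what makes European-formatted numbers parse as floats.

    >>> import io
    >>> import pandas as pd
    >>> european = "price\n1.234,56\n7.890,12\n"
    >>> pd.read_csv(io.StringIO(european), thousands=".", decimal=",")  # doctest: +SKIP
         price
    0  1234.56
    1  7890.12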
+
+    on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error'
+        Specifies what to do upon encountering a bad line (a line with too
+        many fields). Allowed values are:
+
+        * ``'error'``, raise an Exception when a bad line is encountered.
+        * ``'warn'``, raise a warning when a bad line is encountered and
+          skip that line.
+        * ``'skip'``, skip bad lines without raising or warning when they
+          are encountered.
+        * Callable, function that will process a single bad line.
+
+          * With ``engine='python'``, function with signature
+            ``(bad_line: list[str]) -> list[str] | None``.
+            ``bad_line`` is a list of strings split by the ``sep``.
+            If the function returns ``None``, the bad line will be ignored.
+            If the function returns a new ``list`` of strings with more
+            elements than expected, a ``ParserWarning`` will be emitted
+            while dropping extra elements.
+          * With ``engine='pyarrow'``, function with signature
+            as described in pyarrow documentation: `invalid_row_handler
+            <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+            #pyarrow.csv.ParseOptions.invalid_row_handler>`_.
+
+        .. versionadded:: 1.3.0
+
+        .. versionadded:: 1.4.0
+            Callable
+
+        .. versionchanged:: 2.2.0
+            Callable for ``engine='pyarrow'``
+
+    low_memory : bool, default True
+        Internally process the file in chunks, resulting in lower memory use
+        while parsing, but possibly mixed type inference. To ensure no
+        mixed types either set ``False``, or specify the type with the
+        ``dtype`` parameter. Note that the entire file is read into a single
+        :class:`~pandas.DataFrame` regardless, use the ``chunksize`` or
+        ``iterator`` parameter to return the data in chunks. (Only valid
+        with C parser).
+
+    memory_map : bool, default False
+        If a filepath is provided for ``filepath_or_buffer``, map the file
+        object directly onto memory and access the data directly from there.
+        Using this option can improve performance because there is no longer
+        any I/O overhead.
+
+    float_precision : {'high', 'legacy', 'round_trip'}, optional
+        Specifies which converter the C engine should use for floating-point
+        values. The options are ``None`` or ``'high'`` for the ordinary
+        converter, ``'legacy'`` for the original lower precision pandas
+        converter, and ``'round_trip'`` for the round-trip converter.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc. For HTTP(S) URLs the
+        key-value pairs are forwarded to ``urllib.request.Request`` as
+        header options. For other URLs (e.g. starting with "s3://", and
+        "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
+        Please see ``fsspec`` and ``urllib`` for more details, and for more
+        examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_.
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed
+          :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    DataFrame or TextFileReader
+        A general delimited file is returned as a two-dimensional
+        data structure with labeled axes.
+
+    See Also
+    --------
+    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv)
+        file.
+    read_csv : Read a comma-separated values (csv) file into DataFrame.
+    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+    Examples
+    --------
+    >>> pd.read_table("data.txt")  # doctest: +SKIP
+       Name  Value
+    0   foo      1
+    1   bar      2
+    2  #baz      3
+
+    Index and header can be specified via the `index_col` and `header` arguments.
+
+    >>> pd.read_table("data.txt", header=None)  # doctest: +SKIP
+          0      1
+    0  Name  Value
+    1   foo      1
+    2   bar      2
+    3  #baz      3
+
+    >>> pd.read_table("data.txt", index_col="Value")  # doctest: +SKIP
+           Name
+    Value
+    1       foo
+    2       bar
+    3      #baz
+
+    Column types are inferred but can be explicitly specified using the dtype argument.
+
+    >>> pd.read_table("data.txt", dtype={"Value": float})  # doctest: +SKIP
+       Name  Value
+    0   foo    1.0
+    1   bar    2.0
+    2  #baz    3.0
+
+    True, False, and NA values, and thousands separators have defaults,
+    but can be explicitly specified, too. Supply the values you would like
+    as strings or lists of strings!
+
+    >>> pd.read_table("data.txt", na_values=["foo", "bar"])  # doctest: +SKIP
+       Name  Value
+    0   NaN      1
+    1   NaN      2
+    2  #baz      3
+
+    Comment lines in the input file can be skipped using the `comment` argument.
+
+    >>> pd.read_table("data.txt", comment="#")  # doctest: +SKIP
+      Name  Value
+    0  foo      1
+    1  bar      2
+
+    By default, columns with dates will be read as ``object`` rather than ``datetime``.
+
+    >>> df = pd.read_table("tmp.txt")  # doctest: +SKIP
+
+    >>> df  # doctest: +SKIP
+       col 1       col 2            col 3
+    0     10  10/04/2018  Sun 15 Jan 2023
+    1     20  15/04/2018  Fri 12 May 2023
+
+    >>> df.dtypes  # doctest: +SKIP
+    col 1     int64
+    col 2    object
+    col 3    object
+    dtype: object
+
+    Specific columns can be parsed as dates by using the `parse_dates` and
+    `date_format` arguments.
+
+    >>> df = pd.read_table(
+    ...     "tmp.txt",
+    ...     parse_dates=[1, 2],
+    ...     date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
+    ... )  # doctest: +SKIP
+
+    >>> df.dtypes  # doctest: +SKIP
+    col 1             int64
+    col 2    datetime64[ns]
+    col 3    datetime64[ns]
+    dtype: object
+    """

 @overload

From 6a2f3bad070270d0c20e669fcae1c1b0a22602e6 Mon Sep 17 00:00:00 2001
From: huhu-dsy
Date: Thu, 16 Oct 2025 14:49:32 +0800
Subject: [PATCH 09/24] Update readers.py

---
 pandas/io/parsers/readers.py | 2009 +++++++++++++++++-----------------
 1 file changed, 1016 insertions(+), 993 deletions(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 7d982d6798730..2eb9253d9ad4d 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -817,511 +817,533 @@ def read_csv(
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | TextFileReader:
     r"""
-    Read a comma-separated values (csv) file into DataFrame.
-
-    Also supports optionally iterating or breaking of the file
-    into chunks.
-
-    Additional help can be found in the online docs for
-    `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
-
-    Parameters
-    ----------
-
-    filepath_or_buffer : str, path object or file-like object
-        Any valid string path is acceptable. The string could be a URL. Valid
-        URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
-        expected. A local file could be: file://localhost/path/to/table.csv.
-
-        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
-        By file-like object, we refer to objects with a ``read()`` method, such as
-        a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
-
-    sep : str, default ','
-        Character or regex pattern to treat as the delimiter.
If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. - - delimiter : str, optional - Alias for ``sep``. - - header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no - ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` - in the case of MultiIndex columns. - - names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. - - index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. - - usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order - preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. 
- - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. - - dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input - where - the default determines the ``dtype`` of the columns which are not explicitly - listed. - - engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, - while the python engine - is currently more feature-complete. - Multithreading is currently only supported by - the pyarrow engine. - - versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, - and some features - are unsupported, or may not work correctly, with this engine. - - converters : dict of {Hashable : Callable}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. - - true_values : list, optional - Values to consider as ``True`` - in addition to case-insensitive variants of 'True'. - - false_values : list, optional - Values to consider as ``False`` - in addition to case-insensitive variants of 'False'. - - skipinitialspace : bool, default False - Skip spaces after delimiter. - - skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row - should be skipped and ``False`` otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. - - skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). - - nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. - Refers to the number of data rows in the returned DataFrame, excluding: - - * The header row containing column names. - * Rows before the header row, if ``header=1`` or larger. - - Example usage: - - * To read the first 999,999 (non-header) rows: - ``read_csv(..., nrows=999999)`` - - * To read rows 1,000,000 through 1,999,999: - ``read_csv(..., skiprows=1000000, nrows=999999)`` - - na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, - optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", - "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", - "None", "n/a", "nan", "null". - - keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing the data. 
- Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, - ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, - the ``keep_default_na`` and - ``na_values`` parameters will be ignored. - - na_filter : bool, default True - Detect missing value markers (empty strings and the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` can improve the - performance of reading a large file. - - skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. - - parse_dates : bool, None, list of Hashable, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing - columns 1, 2, 3 - each as a separate date column. - - If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_csv`. - - Note: A fast-path exists for iso8601-formatted dates. - - date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times - when used in conjunction with ``parse_dates``. - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - versionadded:: 2.0.0 - - dayfirst : bool, default False - DD/MM format dates, international and European format. - - cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - - iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. - - chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - - compression : str or dict, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and - 'filepath_or_buffer' is - path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or ' - .tar.bz2' (otherwise no compression). 
- If using 'zip' or 'tar', the ZIP file must contain only one data - file to be read in. - Set to ``None`` for no decompression. - Can also be a dict with key ``'method'`` set to one of {``'zip'``, - ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, - ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, - ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. - As an example, the following could be passed for Zstandard - decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. - - versionadded:: 1.5.0 - Added support for `.tar` files. - - versionchanged:: 1.4.0 Zstandard support. - - thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. - - decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European - data). - - lineterminator : str (length 1), optional - Character used to denote a line break. Only valid with C parser. - - quotechar : str (length 1), optional - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. - - quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or - csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default - csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. - Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only - fields containing special characters are quoted (e.g., characters - defined in ``quotechar``, ``delimiter``, or ``lineterminator``. - - doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not - ``QUOTE_NONE``, indicate whether or not to interpret two - consecutive ``quotechar`` elements INSIDE a field as a single - ``quotechar`` element. - - escapechar : str (length 1), optional - Character used to escape other characters. - - comment : str (length 1), optional - Character indicating that the remainder of line should not be - parsed. If found at the beginning of a line, the line will be - ignored altogether. This parameter must be a single character. - Like empty lines (as long as ``skip_blank_lines=True``), fully - commented lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in - ``'a,b,c'`` being treated as the header. - - encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). - `List of Python standard encodings - `_. - - encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_. - - versionadded:: 1.3.0 - - dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) - for the following parameters: ``delimiter``, ``doublequote``, - ``escapechar``, ``skipinitialspace``, ``quotechar``, and - ``quoting``. If it is necessary to override values, a - ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. - - on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too - many fields). Allowed values are: - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and - skip that line. 
- - ``'skip'``, skip bad lines without raising or warning when they - are encountered. - - Callable, function that will process a single bad line. - - With ``engine='python'``, function with signature - ``(bad_line: list[str]) -> list[str] | None``. - ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be - ignored. - If the function returns a new ``list`` of strings with more - elements than expected, a ``ParserWarning`` will be emitted - while dropping extra elements. - - With ``engine='pyarrow'``, function with signature as - described in pyarrow documentation: `invalid_row_handler - `_. - - versionadded:: 1.3.0 - - versionadded:: 1.4.0 - Callable - - versionchanged:: 2.2.0 - Callable for ``engine='pyarrow'`` - - low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory - use while parsing, but possibly mixed type inference. To ensure - no mixed types either set ``False``, or specify the type with the - ``dtype`` parameter. Note that the entire file is read into a - single :class:`~pandas.DataFrame` regardless, use the - ``chunksize`` or ``iterator`` parameter to return the data in - chunks. (Only valid with C parser). - - memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file - object directly onto memory and access the data directly from - there. Using this option can improve performance because there is - no longer any I/O overhead. - - float_precision : {'high', 'legacy', 'round_trip'}, optional - Specifies which converter the C engine should use for - floating-point values. The options are ``None`` or ``'high'`` for - the ordinary converter, ``'legacy'`` for the original lower - precision pandas converter, and ``'round_trip'`` for the - round-trip converter. - - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the - key-value pairs are forwarded to ``urllib.request.Request`` as - header options. For other URLs (e.g. starting with "s3://", and - "gcs://") the key-value pairs are forwarded to ``fsspec.open``. - Please see ``fsspec`` and ``urllib`` for more details, and for - more examples on storage options refer `here - `_. - - dtype_backend : {'numpy_nullable', 'pyarrow'} - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior is - to not use nullable data types. If specified, the behavior is as - follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed - :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable - :class:`ArrowDtype` :class:`DataFrame` - - versionadded:: 2.0 - - Returns - ------- - - DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - - See Also - -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) - file. - read_table : Read general delimited file into DataFrame. - read_fwf : Read a table of fixed-width formatted lines into DataFrame. - - Examples - -------- - >>> pd.read_csv("data.csv") # doctest: +SKIP - Name Value - 0 foo 1 - 1 bar 2 - 2 #baz 3 - - Index and header can be specified via the `index_col` and `header` - arguments. 
- - >>> pd.read_csv("data.csv", header=None) # doctest: +SKIP - 0 1 - 0 Name Value - 1 foo 1 - 2 bar 2 - 3 #baz 3 - - >>> pd.read_csv("data.csv", index_col="Value") # doctest: +SKIP - Name - Value - 1 foo - 2 bar - 3 #baz - - Column types are inferred but can be explicitly specified using the - `dtype` argument. - - >>> pd.read_csv("data.csv", dtype={"Value": float}) # doctest: +SKIP - Name Value - 0 foo 1.0 - 1 bar 2.0 - 2 #baz 3.0 - - True, False, and NA values, and thousands separators have defaults, - but can be explicitly specified, too. Supply the values you would like - as strings or lists of strings! - - >>> pd.read_csv("data.csv", na_values=["foo", "bar"]) # doctest: +SKIP - Name Value - 0 NaN 1 - 1 NaN 2 - 2 #baz 3 - - Comment lines in the input file can be skipped using the `comment` - argument. - - >>> pd.read_csv("data.csv", comment="#") # doctest: +SKIP - Name Value - 0 foo 1 - 1 bar 2 - - By default, columns with dates will be read as ``object`` rather than - ``datetime``. - - >>> df = pd.read_csv("tmp.csv") # doctest: +SKIP - - >>> df # doctest: +SKIP - col 1 col 2 col 3 - 0 10 10/04/2018 Sun 15 Jan 2023 - 1 20 15/04/2018 Fri 12 May 2023 - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 object - col 3 object - dtype: object - - Specific columns can be parsed as dates by using the `parse_dates` and - `date_format` arguments. - - >>> df = pd.read_csv( - "tmp.csv", - parse_dates=[1, 2], - date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, - ) # doctest: +SKIP - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 datetime64[ns] - col 3 datetime64[ns] - dtype: object - """ + Read a comma-separated values (csv) file into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the online docs for + `IO Tools `_. + + Parameters + ---------- + + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. + + sep : str, default ',' + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + In addition, separators longer than 1 character and different from + ``'\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. Regex example: ``'\r\t'``. + + delimiter : str, optional + Alias for ``sep``. + + header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: if no + ``names`` + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly to ``names`` then the behavior is identical to + ``header=None``. 
Explicitly pass ``header=0`` to be able to
+        replace existing names. The header can be a list of integers that
+        specify row locations for a :class:`~pandas.MultiIndex` on the columns
+        e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be
+        skipped (e.g. 2 in this example is skipped). Note that this
+        parameter ignores commented lines and empty lines if
+        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
+        data rather than the first line of the file.
+
+        When inferred from the file contents, headers are kept distinct from
+        each other by renaming duplicate names with a numeric suffix of the
+        form ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``.
+        Empty headers are named ``"Unnamed: {i}"`` or
+        ``"Unnamed: {i}_level_{level}"`` in the case of MultiIndex columns.
+
+    names : Sequence of Hashable, optional
+        Sequence of column labels to apply. If the file contains a header row,
+        then you should explicitly pass ``header=0`` to override the column names.
+        Duplicates in this list are not allowed.
+
+    index_col : Hashable, Sequence of Hashable or False, optional
+        Column(s) to use as row label(s), denoted either by column labels or
+        column indices. If a sequence of labels or indices is given,
+        :class:`~pandas.MultiIndex` will be formed for the row labels.
+
+        Note: ``index_col=False`` can be used to force pandas to *not* use the first
+        column as the index, e.g., when you have a malformed file with delimiters at
+        the end of each line.
+
+    usecols : Sequence of Hashable or Callable, optional
+        Subset of columns to select, denoted either by column labels or
+        column indices. If list-like, all elements must either
+        be positional (i.e. integer indices into the document columns) or strings
+        that correspond to column names provided either by the user in ``names`` or
+        inferred from the document header row(s). If ``names`` are given, the
+        document header row(s) are not taken into account. For example, a valid
+        list-like ``usecols`` parameter would be ``[0, 1, 2]`` or
+        ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]``
+        is the same as ``[1, 0]``. To instantiate a :class:`~pandas.DataFrame` from
+        ``data`` with element order preserved use
+        ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
+        for columns in ``['foo', 'bar']`` order or
+        ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
+        for ``['bar', 'foo']`` order.
+
+        If callable, the callable function will be evaluated against the column
+        names, returning names where the callable function evaluates to ``True``.
+        An example of a valid callable argument would be ``lambda x: x.upper() in
+        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
+        parsing time and lower memory usage.
+
+    dtype : dtype or dict of {Hashable : dtype}, optional
+        Data type(s) to apply to either the whole dataset or individual columns.
+        E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}``
+        Use ``str`` or ``object`` together with suitable ``na_values`` settings
+        to preserve and not interpret ``dtype``.
+        If ``converters`` are specified, they will be applied INSTEAD
+        of ``dtype`` conversion.
+
+        .. versionadded:: 1.5.0
+            Support for ``defaultdict`` was added. Specify a ``defaultdict`` as
+            input where the default determines the ``dtype`` of the columns
+            which are not explicitly listed.
+
+    engine : {'c', 'python', 'pyarrow'}, optional
+        Parser engine to use. The C and pyarrow engines are faster, while
+        the python engine is currently more feature-complete. Multithreading
+        is currently only supported by the pyarrow engine.
+
+        .. versionadded:: 1.4.0
+            The 'pyarrow' engine was added as an *experimental* engine, and
+            some features are unsupported, or may not work correctly, with
+            this engine.
+
+    converters : dict of {Hashable : Callable}, optional
+        Functions for converting values in specified columns. Keys can either
+        be column labels or column indices.
+
+    true_values : list, optional
+        Values to consider as ``True`` in addition to case-insensitive
+        variants of 'True'.
+
+    false_values : list, optional
+        Values to consider as ``False`` in addition to case-insensitive
+        variants of 'False'.
+
+    skipinitialspace : bool, default False
+        Skip spaces after delimiter.
+
+    skiprows : int, list of int or Callable, optional
+        Line numbers to skip (0-indexed) or number of lines to skip (``int``)
+        at the start of the file.
+
+        If callable, the callable function will be evaluated against the row
+        indices, returning ``True`` if the row should be skipped and ``False``
+        otherwise. An example of a valid callable argument would be
+        ``lambda x: x in [0, 2]``.
+
+    skipfooter : int, default 0
+        Number of lines at bottom of file to skip (Unsupported with
+        ``engine='c'``).
+
+    nrows : int, optional
+        Number of rows of file to read. Useful for reading pieces of large files.
+        Refers to the number of data rows in the returned DataFrame, excluding:
+
+        * The header row containing column names.
+        * Rows before the header row, if ``header=1`` or larger.
+
+        Example usage:
+
+        * To read the first 999,999 (non-header) rows:
+          ``read_csv(..., nrows=999999)``
+
+        * To read rows 1,000,000 through 1,999,999:
+          ``read_csv(..., skiprows=1000000, nrows=999999)``
+
+    na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional
+        Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed,
+        specific per-column ``NA`` values. By default the following values are
+        interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND",
+        "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA",
+        "NULL", "NaN", "None", "n/a", "nan", "null".
+
+    keep_default_na : bool, default True
+        Whether or not to include the default ``NaN`` values when parsing the
+        data. Depending on whether ``na_values`` is passed in, the behavior is
+        as follows:
+
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are specified,
+          ``na_values`` is appended to the default ``NaN`` values used for
+          parsing.
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are not
+          specified, only the default ``NaN`` values are used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are specified,
+          only the ``NaN`` values specified in ``na_values`` are used for
+          parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are not
+          specified, no strings will be parsed as ``NaN``.
+
+        Note that if ``na_filter`` is passed in as ``False``, the
+        ``keep_default_na`` and ``na_values`` parameters will be ignored.
+
+    na_filter : bool, default True
+        Detect missing value markers (empty strings and the value of
+        ``na_values``). In data without any ``NA`` values, passing
+        ``na_filter=False`` can improve the performance of reading a large
+        file.
+
+    skip_blank_lines : bool, default True
+        If ``True``, skip over blank lines rather than interpreting as
+        ``NaN`` values.
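A small illustrative aside (not part of the patch; the data is invented) showing how the ``na_values`` and ``keep_default_na`` settings above combine:

    >>> import io
    >>> import pandas as pd
    >>> data = "name,score\nalice,10\nmissing,NA\n"
    >>> pd.read_csv(io.StringIO(data), na_values={"name": ["missing"]})  # doctest: +SKIP
        name  score
    0  alice   10.0
    1    NaN    NaN
    >>> # Drop the defaults and "NA" survives as a literal string:
    >>> pd.read_csv(io.StringIO(data), keep_default_na=False,
    ...             na_values={"name": ["missing"]})  # doctest: +SKIP
        name score
    0  alice    10
    1    NaN    NA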
+
+    parse_dates : bool, None, list of Hashable, default None
+        The behavior is as follows:
+
+        * ``bool``. If ``True`` -> try parsing the index.
+        * ``None``. Behaves like ``True`` if ``date_format`` is specified.
+        * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing
+          columns 1, 2, 3 each as a separate date column.
+
+        If a column or index cannot be represented as an array of ``datetime``,
+        say because of an unparsable value or a mixture of timezones, the column
+        or index will be returned unaltered as an ``object`` data type. For
+        non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
+        :func:`~pandas.read_csv`.
+
+        Note: A fast-path exists for iso8601-formatted dates.
+
+    date_format : str or dict of column -> format, optional
+        Format to use for parsing dates and/or times when used in conjunction
+        with ``parse_dates``. The strftime to parse time, e.g.
+        :const:`"%d/%m/%Y"`. See `strftime documentation
+        <https://docs.python.org/3/library/datetime.html
+        #strftime-and-strptime-behavior>`_ for more information on choices,
+        though note that :const:`"%f"` will parse all the way up to
+        nanoseconds. You can also pass:
+
+        - "ISO8601", to parse any `ISO8601
+          <https://en.wikipedia.org/wiki/ISO_8601>`_ time string (not
+          necessarily in exactly the same format);
+        - "mixed", to infer the format for each element individually. This is
+          risky, and you should probably use it along with `dayfirst`.
+
+        .. versionadded:: 2.0.0
+
+    dayfirst : bool, default False
+        DD/MM format dates, international and European format.
+
+    cache_dates : bool, default True
+        If ``True``, use a cache of unique, converted dates to apply the
+        ``datetime`` conversion. May produce significant speed-up when parsing
+        duplicate date strings, especially ones with timezone offsets.
+
+    iterator : bool, default False
+        Return ``TextFileReader`` object for iteration or getting chunks with
+        ``get_chunk()``.
+
+    chunksize : int, optional
+        Number of lines to read from the file per chunk. Passing a value will
+        cause the function to return a ``TextFileReader`` object for iteration.
+        See the `IO Tools docs
+        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
+        for more information on ``iterator`` and ``chunksize``.
+
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data
+        file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of {``'zip'``,
+        ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other
+        key-value pairs are forwarded to ``zipfile.ZipFile``,
+        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
+        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard
+        decompression using a custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+
+        .. versionchanged:: 1.4.0 Zstandard support.
+
+    thousands : str (length 1), optional
+        Character acting as the thousands separator in numerical values.
+
+    decimal : str (length 1), default '.'
+        Character to recognize as decimal point (e.g., use ',' for European
+        data).
+
+    lineterminator : str (length 1), optional
+        Character used to denote a line break. Only valid with C parser.
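Illustrative aside (not part of the patch; the file name is invented): compression is inferred from the extension on both the write and the read side.

    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    >>> df.to_csv("data.csv.gz", index=False)  # gzip chosen from ".gz"
    >>> pd.read_csv("data.csv.gz")  # doctest: +SKIP
       a  b
    0  1  3
    1  2  4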
+
+    quotechar : str (length 1), optional
+        Character used to denote the start and end of a quoted item. Quoted
+        items can include the ``delimiter`` and it will be ignored.
+
+    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL,
+        2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default
+        csv.QUOTE_MINIMAL
+        Control field quoting behavior per ``csv.QUOTE_*`` constants.
+        Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only
+        fields containing special characters are quoted (e.g., characters
+        defined in ``quotechar``, ``delimiter``, or ``lineterminator``).
+
+    doublequote : bool, default True
+        When ``quotechar`` is specified and ``quoting`` is not
+        ``QUOTE_NONE``, indicate whether or not to interpret two
+        consecutive ``quotechar`` elements INSIDE a field as a single
+        ``quotechar`` element.
+
+    escapechar : str (length 1), optional
+        Character used to escape other characters.
+
+    comment : str (length 1), optional
+        Character indicating that the remainder of line should not be
+        parsed. If found at the beginning of a line, the line will be
+        ignored altogether. This parameter must be a single character.
+        Like empty lines (as long as ``skip_blank_lines=True``), fully
+        commented lines are ignored by the parameter ``header`` but not by
+        ``skiprows``. For example, if ``comment='#'``, parsing
+        ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in
+        ``'a,b,c'`` being treated as the header.
+
+    encoding : str, optional, default 'utf-8'
+        Encoding to use for UTF when reading/writing (ex. ``'utf-8'``).
+        `List of Python standard encodings
+        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
+
+    encoding_errors : str, optional, default 'strict'
+        How encoding errors are treated. `List of possible values
+        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
+
+        .. versionadded:: 1.3.0
+
+    dialect : str or csv.Dialect, optional
+        If provided, this parameter will override values (default or not)
+        for the following parameters: ``delimiter``, ``doublequote``,
+        ``escapechar``, ``skipinitialspace``, ``quotechar``, and
+        ``quoting``. If it is necessary to override values, a
+        ``ParserWarning`` will be issued. See ``csv.Dialect``
+        documentation for more details.
+
+    on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error'
+        Specifies what to do upon encountering a bad line (a line with too
+        many fields). Allowed values are:
+
+        - ``'error'``, raise an Exception when a bad line is encountered.
+        - ``'warn'``, raise a warning when a bad line is encountered and
+          skip that line.
+        - ``'skip'``, skip bad lines without raising or warning when they
+          are encountered.
+        - Callable, function that will process a single bad line.
+
+          - With ``engine='python'``, function with signature
+            ``(bad_line: list[str]) -> list[str] | None``.
+            ``bad_line`` is a list of strings split by the ``sep``.
+            If the function returns ``None``, the bad line will be ignored.
+            If the function returns a new ``list`` of strings with more
+            elements than expected, a ``ParserWarning`` will be emitted
+            while dropping extra elements.
+          - With ``engine='pyarrow'``, function with signature as
+            described in pyarrow documentation: `invalid_row_handler
+            <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+            #pyarrow.csv.ParseOptions.invalid_row_handler>`_.
+
+        .. versionadded:: 1.3.0
+
+        .. versionadded:: 1.4.0
+            Callable
+
+        .. versionchanged:: 2.2.0
+            Callable for ``engine='pyarrow'``
+
+    low_memory : bool, default True
+        Internally process the file in chunks, resulting in lower memory
+        use while parsing, but possibly mixed type inference. To ensure
+        no mixed types either set ``False``, or specify the type with the
+        ``dtype`` parameter. Note that the entire file is read into a
+        single :class:`~pandas.DataFrame` regardless, use the
+        ``chunksize`` or ``iterator`` parameter to return the data in
+        chunks. (Only valid with C parser).
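Hedged sketch (not part of the patch; the data and handler name are invented) of the callable form of ``on_bad_lines`` documented above, with ``engine='python'``:

    >>> import io
    >>> import pandas as pd
    >>> data = "a,b\n1,2\n3,4,5\n6,7\n"
    >>> def keep_first_two(bad_line):
    ...     # receives the split fields of each malformed row;
    ...     # returning a list repairs it, returning None drops it
    ...     return bad_line[:2]
    >>> pd.read_csv(io.StringIO(data), on_bad_lines=keep_first_two,
    ...             engine="python")  # doctest: +SKIP
       a  b
    0  1  2
    1  3  4
    2  6  7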
+
+    memory_map : bool, default False
+        If a filepath is provided for ``filepath_or_buffer``, map the file
+        object directly onto memory and access the data directly from
+        there. Using this option can improve performance because there is
+        no longer any I/O overhead.
+
+    float_precision : {'high', 'legacy', 'round_trip'}, optional
+        Specifies which converter the C engine should use for
+        floating-point values. The options are ``None`` or ``'high'`` for
+        the ordinary converter, ``'legacy'`` for the original lower
+        precision pandas converter, and ``'round_trip'`` for the
+        round-trip converter.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc. For HTTP(S) URLs the
+        key-value pairs are forwarded to ``urllib.request.Request`` as
+        header options. For other URLs (e.g. starting with "s3://", and
+        "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
+        Please see ``fsspec`` and ``urllib`` for more details, and for
+        more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_.
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior is
+        to not use nullable data types. If specified, the behavior is as
+        follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed
+          :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    DataFrame or TextFileReader
+        A comma-separated values (csv) file is returned as a two-dimensional
+        data structure with labeled axes.
+
+    See Also
+    --------
+    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv)
+        file.
+    read_table : Read general delimited file into DataFrame.
+    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+    Examples
+    --------
+    >>> pd.read_csv("data.csv")  # doctest: +SKIP
+       Name  Value
+    0   foo      1
+    1   bar      2
+    2  #baz      3
+
+    Index and header can be specified via the `index_col` and `header`
+    arguments.
+
+    >>> pd.read_csv("data.csv", header=None)  # doctest: +SKIP
+          0      1
+    0  Name  Value
+    1   foo      1
+    2   bar      2
+    3  #baz      3
+
+    >>> pd.read_csv("data.csv", index_col="Value")  # doctest: +SKIP
+           Name
+    Value
+    1       foo
+    2       bar
+    3      #baz
+
+    Column types are inferred but can be explicitly specified using the
+    `dtype` argument.
+
+    >>> pd.read_csv("data.csv", dtype={"Value": float})  # doctest: +SKIP
+       Name  Value
+    0   foo    1.0
+    1   bar    2.0
+    2  #baz    3.0
+
+    True, False, and NA values, and thousands separators have defaults,
+    but can be explicitly specified, too. Supply the values you would like
+    as strings or lists of strings!
+
+    >>> pd.read_csv("data.csv", na_values=["foo", "bar"])  # doctest: +SKIP
+       Name  Value
+    0   NaN      1
+    1   NaN      2
+    2  #baz      3
+
+    Comment lines in the input file can be skipped using the `comment`
+    argument.
+
+    >>> pd.read_csv("data.csv", comment="#")  # doctest: +SKIP
+      Name  Value
+    0  foo      1
+    1  bar      2
+
+    By default, columns with dates will be read as ``object`` rather than
+    ``datetime``.
+
+    >>> df = pd.read_csv("tmp.csv")  # doctest: +SKIP
+
+    >>> df  # doctest: +SKIP
+       col 1       col 2            col 3
+    0     10  10/04/2018  Sun 15 Jan 2023
+    1     20  15/04/2018  Fri 12 May 2023
+
+    >>> df.dtypes  # doctest: +SKIP
+    col 1     int64
+    col 2    object
+    col 3    object
+    dtype: object
+
+    Specific columns can be parsed as dates by using the `parse_dates` and
+    `date_format` arguments.
+
+    >>> df = pd.read_csv(
+    ...     "tmp.csv",
+    ...     parse_dates=[1, 2],
+    ...     date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
+    ... )  # doctest: +SKIP
+
+    >>> df.dtypes  # doctest: +SKIP
+    col 1             int64
+    col 2    datetime64[ns]
+    col 3    datetime64[ns]
+    dtype: object
+    """
     # locals() should never be modified
     kwds = locals().copy()
     del kwds["filepath_or_buffer"]
@@ -1441,524 +1463,525 @@ def read_table(
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | TextFileReader:
     """
-    Read general delimited file into DataFrame.
-
-    Also supports optionally iterating or breaking of the file
-    into chunks.
-
-    Additional help can be found in the online docs for
-    `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
-
-    Parameters
-    ----------
-
-    filepath_or_buffer : str, path object or file-like object
-        Any valid string path is acceptable. The string could be a URL. Valid
-        URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
-        expected. A local file could be: file://localhost/path/to/table.csv.
-
-        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
-        By file-like object, we refer to objects with a ``read()`` method, such as
-        a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
-
-    sep : str, default '\\t' (tab-stop)
-        Character or regex pattern to treat as the delimiter. If ``sep=None``, the
-        C engine cannot automatically detect
-        the separator, but the Python parsing engine can, meaning the latter will
-        be used and automatically detect the separator from only the first valid
-        row of the file by Python's builtin sniffer tool, ``csv.Sniffer``.
-
-        In addition, separators longer than 1 character and different from
-        ``'\\s+'`` will be interpreted as regular expressions and will also force
-        the use of the Python parsing engine. Note that regex delimiters are prone
-        to ignoring quoted data. Regex example: ``'\r\t'``.
-
-    delimiter : str, optional
-        Alias for ``sep``.
-
-    header : int, Sequence of int, 'infer' or None, default 'infer'
-        Row number(s) containing column labels and marking the start of the
-        data (zero-indexed). Default behavior is to infer the column names:
-        if no ``names`` are passed the behavior is identical to ``header=0``
-        and column names are inferred from the first line of the file, if
-        column names are passed explicitly to ``names`` then the behavior is
-        identical to ``header=None``. Explicitly pass ``header=0`` to be
-        able to replace existing names. The header can be a list of integers
-        that specify row locations for a :class:`~pandas.MultiIndex` on the
-        columns e.g. ``[0, 1, 3]``. Intervening rows that are not specified
-        will be skipped (e.g. 2 in this example is skipped). Note that this
-        parameter ignores commented lines and empty lines if
-        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
-        data rather than the first line of the file.
-
-        When inferred from the file contents, headers are kept distinct from
-        each other by renaming duplicate names with a numeric suffix of the
-        form ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``.
- Empty headers are named ``"Unnamed: {i}"`` or - ``"Unnamed: {i}_level_{level}"`` in the case of MultiIndex columns. - - names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column - names. Duplicates in this list are not allowed. - - index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or - column indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use - the first column as the index, e.g., when you have a malformed file - with delimiters at the end of each line. - - usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or - column indices. If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or - strings that correspond to column names provided either by the user - in ``names`` or inferred from the document header row(s). If - ``names`` are given, the document header row(s) are not taken into - account. For example, a valid list-like ``usecols`` parameter would - be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is - ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To - instantiate a :class:`~pandas.DataFrame` from ``data`` with element - order preserved use - ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the - column names, returning names where the callable function evaluates - to ``True``. An example of a valid callable argument would be - ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this - parameter results in much faster parsing time and lower memory usage. - - dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual - columns. E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` - settings to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` - as input where the default determines the ``dtype`` of the - columns which are not explicitly listed. - - engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, while - the python engine is currently more feature-complete. Multithreading - is currently only supported by the pyarrow engine. - - versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and - some features are unsupported, or may not work correctly, with - this engine. - - converters : dict of {Hashable : Callable}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. - - true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive - variants of 'True'. - - false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive - variants of 'False'. 
-
- skipinitialspace : bool, default False
- Skip spaces after delimiter.
-
- skiprows : int, list of int or Callable, optional
- Line numbers to skip (0-indexed) or number of lines to skip (``int``)
- at the start of the file.
-
- If callable, the callable function will be evaluated against the row
- indices, returning ``True`` if the row should be skipped and ``False``
- otherwise. An example of a valid callable argument would be
- ``lambda x: x in [0, 2]``.
-
- skipfooter : int, default 0
- Number of lines at bottom of file to skip (Unsupported with
- ``engine='c'``).
-
- nrows : int, optional
- Number of rows of file to read. Useful for reading pieces of large
- files. Refers to the number of data rows in the returned DataFrame,
- excluding:
-
- * The header row containing column names.
- * Rows before the header row, if ``header=1`` or larger.
-
- Example usage:
-
- * To read the first 999,999 (non-header) rows:
-
- ``read_table(..., nrows=999999)``
-
- * To read rows 1,000,000 through 1,999,999:
-
- ``read_table(..., skiprows=1000000, nrows=999999)``
-
- na_values : Hashable, Iterable of Hashable or dict of {Hashable :
- Iterable}, optional
- Additional strings to recognize as ``NA``/``NaN``. If ``dict``
- passed, specific per-column ``NA`` values. By default the following
- values are interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA",
- "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "",
- "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null".
-
- keep_default_na : bool, default True
- Whether or not to include the default ``NaN`` values when parsing
- the data. Depending on whether ``na_values`` is passed in, the
- behavior is as follows:
-
- * If ``keep_default_na`` is ``True``, and ``na_values`` are
- specified, ``na_values`` is appended to the default ``NaN`` values
- used for parsing.
- * If ``keep_default_na`` is ``True``, and ``na_values`` are not
- specified, only the default ``NaN`` values are used for parsing.
- * If ``keep_default_na`` is ``False``, and ``na_values`` are
- specified, only the ``NaN`` values specified ``na_values`` are
- used for parsing.
- * If ``keep_default_na`` is ``False``, and ``na_values`` are not
- specified, no strings will be parsed as ``NaN``.
-
- Note that if ``na_filter`` is passed in as ``False``, the
- ``keep_default_na`` and ``na_values`` parameters will be ignored.
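A minimal sketch of the ``na_values``/``keep_default_na`` interaction described just above; this is an illustration added alongside the patch, using an in-memory ``io.StringIO`` buffer in place of a real file:

    import io

    import pandas as pd

    # Tab-separated data with both a default NA marker ("NA") and a
    # custom one ("missing").
    buf = io.StringIO("a\tb\nNA\t1\nmissing\t2\n")

    # With keep_default_na=False, only the strings listed in na_values
    # are treated as missing; the literal "NA" survives as ordinary text.
    df = pd.read_table(buf, na_values=["missing"], keep_default_na=False)
    print(df)  # row 0 keeps "NA" as a string; row 1 has NaN in column "a"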
-
- na_filter : bool, default True
- Detect missing value markers (empty strings and the value of
- ``na_values``). In data without any ``NA`` values, passing
- ``na_filter=False`` can improve the performance of reading a large
- file.
-
- skip_blank_lines : bool, default True
- If ``True``, skip over blank lines rather than interpreting as
- ``NaN`` values.
-
- parse_dates : bool, None, list of Hashable, default None
- The behavior is as follows:
-
- * ``bool``. If ``True`` -> try parsing the index.
- * ``None``. Behaves like ``True`` if ``date_format`` is specified.
- * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try
- parsing columns 1, 2, 3 each as a separate date column.
-
- If a column or index cannot be represented as an array of
- ``datetime``, say because of an unparsable value or a mixture of
- timezones, the column or index will be returned unaltered as an
- ``object`` data type. For non-standard ``datetime`` parsing, use
- :func:`~pandas.to_datetime` after :func:`~pandas.read_table`.
-
- Note: A fast-path exists for iso8601-formatted dates.
-
- date_format : str or dict of column -> format, optional
- Format to use for parsing dates and/or times when used in conjunction
- with ``parse_dates``. The strftime to parse time, e.g.
- :const:`"%d/%m/%Y"`. See `strftime documentation
- `_ for more information on choices,
- though note that :const:`"%f"` will parse all the way up to
- nanoseconds. You can also pass:
-
- * "ISO8601", to parse any `ISO8601
- `_ time string (not
- necessarily in exactly the same format);
- * "mixed", to infer the format for each element individually. This
- is risky, and you should probably use it along with `dayfirst`.
-
- versionadded:: 2.0.0
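As a quick illustration of ``parse_dates`` with an explicit ``date_format`` (a sketch assuming pandas >= 2.0, where a single format string applies to every column listed in ``parse_dates``):

    import io

    import pandas as pd

    buf = io.StringIO("col 1\tcol 2\n10\t10/04/2018\n20\t15/04/2018\n")

    # Parse "col 2" with an explicit strftime pattern.
    df = pd.read_table(buf, parse_dates=["col 2"], date_format="%d/%m/%Y")
    print(df.dtypes)  # col 2 -> datetime64[ns]

For formats pandas cannot describe with one pattern, reading as ``object`` and calling ``pd.to_datetime`` afterwards, as the text above suggests, is the more flexible route.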
-
- dayfirst : bool, default False
- DD/MM format dates, international and European format.
-
- cache_dates : bool, default True
- If ``True``, use a cache of unique, converted dates to apply the
- ``datetime`` conversion. May produce significant speed-up when
- parsing duplicate date strings, especially ones with timezone
- offsets.
-
- iterator : bool, default False
- Return ``TextFileReader`` object for iteration or getting chunks
- with ``get_chunk()``.
-
- chunksize : int, optional
- Number of lines to read from the file per chunk. Passing a value
- will cause the function to return a ``TextFileReader`` object for
- iteration. See the `IO Tools docs
- `_
- for more information on ``iterator`` and ``chunksize``.
-
- compression : str or dict, default 'infer'
- For on-the-fly decompression of on-disk data. If 'infer' and
- 'filepath_or_buffer' is path-like, then detect compression from the
- following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
- '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
- If using 'zip' or 'tar', the ZIP file must contain only one data
- file to be read in. Set to ``None`` for no decompression.
- Can also be a dict with key ``'method'`` set to one of
- {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
- and other key-value pairs are forwarded to ``zipfile.ZipFile``,
- ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
- ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
- As an example, the following could be passed for Zstandard
- decompression using a custom compression dictionary:
- ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
-
- versionadded:: 1.5.0
- Added support for `.tar` files.
-
- versionchanged:: 1.4.0 Zstandard support.
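A short sketch of the ``compression='infer'`` behavior described above (illustration only; it writes a scratch file in the working directory and relies on the ``.gz`` suffix being recognized on both the write and read paths):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    # Write a gzip-compressed, tab-separated file; the default
    # compression="infer" then detects gzip from the ".gz" suffix.
    df.to_csv("tmp.tsv.gz", sep="\t", index=False)
    roundtripped = pd.read_table("tmp.tsv.gz")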
-
- thousands : str (length 1), optional
- Character acting as the thousands separator in numerical values.
-
- decimal : str (length 1), default '.'
- Character to recognize as decimal point (e.g., use ',' for European
- data).
-
- lineterminator : str (length 1), optional
- Character used to denote a line break. Only valid with C parser.
-
- quotechar : str (length 1), default '"'
- Character used to denote the start and end of a quoted item. Quoted
- items can include the ``delimiter`` and it will be ignored.
-
- quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL,
- 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default
- csv.QUOTE_MINIMAL
- Control field quoting behavior per ``csv.QUOTE_*`` constants.
- Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only
- fields containing special characters are quoted (e.g., characters
- defined in ``quotechar``, ``delimiter``, or ``lineterminator``).
-
- doublequote : bool, default True
- When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
- indicate whether or not to interpret two consecutive ``quotechar``
- elements INSIDE a field as a single ``quotechar`` element.
-
- escapechar : str (length 1), optional
- Character used to escape other characters.
-
- comment : str (length 1), optional
- Character indicating that the remainder of line should not be
- parsed. If found at the beginning of a line, the line will be
- ignored altogether. This parameter must be a single character. Like
- empty lines (as long as ``skip_blank_lines=True``), fully commented
- lines are ignored by the parameter ``header`` but not by
- ``skiprows``. For example, if ``comment='#'``, parsing
- ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in
- ``'a,b,c'`` being treated as the header.
-
- encoding : str, optional, default 'utf-8'
- Encoding to use for UTF when reading/writing (ex. ``'utf-8'``).
- `List of Python standard encodings
- `_.
-
- encoding_errors : str, optional, default 'strict'
- How encoding errors are treated. `List of possible values
- `_.
-
- versionadded:: 1.3.0
-
- dialect : str or csv.Dialect, optional
- If provided, this parameter will override values (default or not)
- for the following parameters: ``delimiter``, ``doublequote``,
- ``escapechar``, ``skipinitialspace``, ``quotechar``, and ``quoting``.
- If it is necessary to override values, a ``ParserWarning`` will be
- issued. See ``csv.Dialect`` documentation for more details.
-
- on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error'
- Specifies what to do upon encountering a bad line (a line with too
- many fields). Allowed values are:
-
- * ``'error'``, raise an Exception when a bad line is encountered.
- * ``'warn'``, raise a warning when a bad line is encountered and
- skip that line.
- * ``'skip'``, skip bad lines without raising or warning when they
- are encountered.
- * Callable, function that will process a single bad line.
-
- * With ``engine='python'``, function with signature
- ``(bad_line: list[str]) -> list[str] | None``.
- ``bad_line`` is a list of strings split by the ``sep``.
- If the function returns ``None``, the bad line will be ignored.
- If the function returns a new ``list`` of strings with more
- elements than expected, a ``ParserWarning`` will be emitted
- while dropping extra elements.
- * With ``engine='pyarrow'``, function with signature
- as described in pyarrow documentation: `invalid_row_handler
- `_.
-
- versionadded:: 1.3.0
-
- versionadded:: 1.4.0
- Callable
-
- versionchanged:: 2.2.0
- Callable for ``engine='pyarrow'``
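The callable form of ``on_bad_lines`` is easiest to see in a sketch (an illustration assuming pandas >= 1.4, where the callable requires ``engine='python'``):

    import io

    import pandas as pd

    # The second data row has three fields instead of two.
    buf = io.StringIO("a\tb\n1\t2\n3\t4\t5\n")

    def keep_first_two(bad_line):
        # bad_line is the raw row split on the separator; returning a
        # trimmed list keeps the row, returning None would drop it.
        return bad_line[:2]

    df = pd.read_table(buf, engine="python", on_bad_lines=keep_first_two)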
-
- low_memory : bool, default True
- Internally process the file in chunks, resulting in lower memory use
- while parsing, but possibly mixed type inference. To ensure no
- mixed types either set ``False``, or specify the type with the
- ``dtype`` parameter. Note that the entire file is read into a single
- :class:`~pandas.DataFrame` regardless, use the ``chunksize`` or
- ``iterator`` parameter to return the data in chunks. (Only valid
- with C parser).
-
- memory_map : bool, default False
- If a filepath is provided for ``filepath_or_buffer``, map the file
- object directly onto memory and access the data directly from there.
- Using this option can improve performance because there is no longer
- any I/O overhead.
-
- float_precision : {'high', 'legacy', 'round_trip'}, optional
- Specifies which converter the C engine should use for floating-point
- values. The options are ``None`` or ``'high'`` for the ordinary
- converter, ``'legacy'`` for the original lower precision pandas
- converter, and ``'round_trip'`` for the round-trip converter.
-
- storage_options : dict, optional
- Extra options that make sense for a particular storage connection,
- e.g. host, port, username, password, etc. For HTTP(S) URLs the
- key-value pairs are forwarded to ``urllib.request.Request`` as
- header options. For other URLs (e.g. starting with "s3://", and
- "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
- Please see ``fsspec`` and ``urllib`` for more details, and for more
- examples on storage options refer `here
- `_.
-
- dtype_backend : {'numpy_nullable', 'pyarrow'}
- Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). If not specified, the default behavior
- is to not use nullable data types. If specified, the behavior
- is as follows:
-
- * ``"numpy_nullable"``: returns nullable-dtype-backed
- :class:`DataFrame`
- * ``"pyarrow"``: returns pyarrow-backed nullable
- :class:`ArrowDtype` :class:`DataFrame`
-
- versionadded:: 2.0
-
- Returns
- -------
-
- DataFrame or TextFileReader
- A general delimited file is returned as two-dimensional
- data structure with labeled axes.
-
- See Also
- --------
- DataFrame.to_csv : Write DataFrame to a comma-separated values (csv)
- file.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- read_fwf : Read a table of fixed-width formatted lines into DataFrame.
-
- Examples
- --------
- >>> pd.read_table("data.txt") # doctest: +SKIP
- Name Value
- 0 foo 1
- 1 bar 2
- 2 #baz 3
-
- Index and header can be specified via the `index_col` and `header` arguments.
-
- >>> pd.read_table("data.txt", header=None) # doctest: +SKIP
- 0 1
- 0 Name Value
- 1 foo 1
- 2 bar 2
- 3 #baz 3
-
- >>> pd.read_table("data.txt", index_col="Value") # doctest: +SKIP
- Name Value
- 1 foo
- 2 bar
- 3 #baz
-
- Column types are inferred but can be explicitly specified using the dtype argument.
-
- >>> pd.read_table("data.txt", dtype={"Value": float}) # doctest: +SKIP
- Name Value
- 0 foo 1.0
- 1 bar 2.0
- 2 #baz 3.0
-
- True, False, and NA values, and thousands separators have defaults,
- but can be explicitly specified, too. Supply the values you would like
- as strings or lists of strings!
-
- >>> pd.read_table("data.txt", na_values=["foo", "bar"]) # doctest: +SKIP
- Name Value
- 0 NaN 1
- 1 NaN 2
- 2 #baz 3
-
- Comment lines in the input file can be skipped using the `comment` argument.
-
- >>> pd.read_table("data.txt", comment="#") # doctest: +SKIP
- Name Value
- 0 foo 1
- 1 bar 2
-
- By default, columns with dates will be read as ``object`` rather than ``datetime``.
-
- >>> df = pd.read_table("tmp.txt") # doctest: +SKIP
-
- >>> df # doctest: +SKIP
- col 1 col 2 col 3
- 0 10 10/04/2018 Sun 15 Jan 2023
- 1 20 15/04/2018 Fri 12 May 2023
-
- >>> df.dtypes # doctest: +SKIP
- col 1 int64
- col 2 object
- col 3 object
- dtype: object
-
- Specific columns can be parsed as dates by using the `parse_dates` and
- `date_format` arguments.
-
- >>> df = pd.read_table(
- ... "tmp.txt",
- ... parse_dates=[1, 2],
- ... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
- ... ) # doctest: +SKIP
-
- >>> df.dtypes # doctest: +SKIP
- col 1 int64
- col 2 datetime64[ns]
- col 3 datetime64[ns]
- dtype: object
- """
+ Read general delimited file into DataFrame.
+
+ Also supports optionally iterating or breaking of the file
+ into chunks.
+
+ Additional help can be found in the online docs for
+ `IO Tools `_.
+
+ Parameters
+ ----------
+ filepath_or_buffer : str, path object or file-like object
+ Any valid string path is acceptable. The string could be a URL. Valid
+ URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
+ expected. A local file could be: file://localhost/path/to/table.csv.
+
+ If you want to pass in a path object, pandas accepts any ``os.PathLike``.
+
+ By file-like object, we refer to objects with a ``read()`` method, such as
+ a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
+
+ sep : str, default '\\t' (tab-stop)
+ Character or regex pattern to treat as the delimiter. If ``sep=None``, the
+ C engine cannot automatically detect
+ the separator, but the Python parsing engine can, meaning the latter will
+ be used and automatically detect the separator from only the first valid
+ row of the file by Python's builtin sniffer tool, ``csv.Sniffer``.
+
+ In addition, separators longer than 1 character and different from
+ ``'\\s+'`` will be interpreted as regular expressions and will also force
+ the use of the Python parsing engine. Note that regex delimiters are prone
+ to ignoring quoted data. Regex example: ``'\r\t'``.
+
+ delimiter : str, optional
+ Alias for ``sep``.
+
+ header : int, Sequence of int, 'infer' or None, default 'infer'
+ Row number(s) containing column labels and marking the start of the
+ data (zero-indexed). Default behavior is to infer the column names:
+ if no ``names`` are passed the behavior is identical to ``header=0``
+ and column names are inferred from the first line of the file, if
+ column names are passed explicitly to ``names`` then the behavior is
+ identical to ``header=None``. Explicitly pass ``header=0`` to be
+ able to replace existing names. The header can be a list of integers
+ that specify row locations for a :class:`~pandas.MultiIndex` on the
+ columns e.g. ``[0, 1, 3]``. Intervening rows that are not specified
+ will be skipped (e.g. 2 in this example is skipped). Note that this
+ parameter ignores commented lines and empty lines if
+ ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
+ data rather than the first line of the file.
+
+ When inferred from the file contents, headers are kept distinct from
+ each other by renaming duplicate names with a numeric suffix of the
+ form ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``.
+ Empty headers are named ``"Unnamed: {i}"`` or
+ ``"Unnamed: {i}_level_{level}"`` in the case of MultiIndex columns.
+
+ names : Sequence of Hashable, optional
+ Sequence of column labels to apply. If the file contains a header row,
+ then you should explicitly pass ``header=0`` to override the column
+ names. Duplicates in this list are not allowed.
+
+ index_col : Hashable, Sequence of Hashable or False, optional
+ Column(s) to use as row label(s), denoted either by column labels or
+ column indices. If a sequence of labels or indices is given,
+ :class:`~pandas.MultiIndex` will be formed for the row labels.
+
+ Note: ``index_col=False`` can be used to force pandas to *not* use
+ the first column as the index, e.g., when you have a malformed file
+ with delimiters at the end of each line.
+
+ usecols : Sequence of Hashable or Callable, optional
+ Subset of columns to select, denoted either by column labels or
+ column indices. If list-like, all elements must either
+ be positional (i.e. integer indices into the document columns) or
+ strings that correspond to column names provided either by the user
+ in ``names`` or inferred from the document header row(s). If
+ ``names`` are given, the document header row(s) are not taken into
+ account. For example, a valid list-like ``usecols`` parameter would
+ be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is
+ ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To
+ instantiate a :class:`~pandas.DataFrame` from ``data`` with element
+ order preserved use
+ ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
+ for columns in ``['foo', 'bar']`` order or
+ ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
+ for ``['bar', 'foo']`` order.
+
+ If callable, the callable function will be evaluated against the
+ column names, returning names where the callable function evaluates
+ to ``True``. An example of a valid callable argument would be
+ ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this
+ parameter results in much faster parsing time and lower memory usage.
+
+ dtype : dtype or dict of {Hashable : dtype}, optional
+ Data type(s) to apply to either the whole dataset or individual
+ columns. E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}``
+ Use ``str`` or ``object`` together with suitable ``na_values``
+ settings to preserve and not interpret ``dtype``.
+ If ``converters`` are specified, they will be applied INSTEAD
+ of ``dtype`` conversion.
+
+ .. versionadded:: 1.5.0
+ Support for ``defaultdict`` was added. Specify a ``defaultdict``
+ as input where the default determines the ``dtype`` of the
+ columns which are not explicitly listed.
+
+ engine : {'c', 'python', 'pyarrow'}, optional
+ Parser engine to use. The C and pyarrow engines are faster, while
+ the python engine is currently more feature-complete. Multithreading
+ is currently only supported by the pyarrow engine.
+
+ .. versionadded:: 1.4.0
+ The 'pyarrow' engine was added as an *experimental* engine, and
+ some features are unsupported, or may not work correctly, with
+ this engine.
+
+ converters : dict of {Hashable : Callable}, optional
+ Functions for converting values in specified columns. Keys can either
+ be column labels or column indices.
+
+ true_values : list, optional
+ Values to consider as ``True`` in addition to case-insensitive
+ variants of 'True'.
+
+ false_values : list, optional
+ Values to consider as ``False`` in addition to case-insensitive
+ variants of 'False'.
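A minimal sketch of ``true_values``/``false_values`` (illustration only; note the column converts to booleans only when every value in it resolves to ``True`` or ``False``):

    import io

    import pandas as pd

    buf = io.StringIO("id\tactive\n1\tyes\n2\tno\n")

    # Map "yes"/"no" onto booleans at parse time.
    df = pd.read_table(buf, true_values=["yes"], false_values=["no"])
    print(df["active"].dtype)  # bool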
+
+ skipinitialspace : bool, default False
+ Skip spaces after delimiter.
+
+ skiprows : int, list of int or Callable, optional
+ Line numbers to skip (0-indexed) or number of lines to skip (``int``)
+ at the start of the file.
+
+ If callable, the callable function will be evaluated against the row
+ indices, returning ``True`` if the row should be skipped and ``False``
+ otherwise. An example of a valid callable argument would be
+ ``lambda x: x in [0, 2]``.
+
+ skipfooter : int, default 0
+ Number of lines at bottom of file to skip (Unsupported with
+ ``engine='c'``).
+
+ nrows : int, optional
+ Number of rows of file to read. Useful for reading pieces of large
+ files. Refers to the number of data rows in the returned DataFrame,
+ excluding:
+
+ * The header row containing column names.
+ * Rows before the header row, if ``header=1`` or larger.
+
+ Example usage:
+
+ * To read the first 999,999 (non-header) rows:
+ ``read_table(..., nrows=999999)``
+ * To read rows 1,000,000 through 1,999,999:
+ ``read_table(..., skiprows=1000000, nrows=999999)``
+
+ na_values : Hashable, Iterable of Hashable or dict of {Hashable :
+ Iterable}, optional
+ Additional strings to recognize as ``NA``/``NaN``. If ``dict``
+ passed, specific per-column ``NA`` values. By default the following
+ values are interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA",
+ "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "",
+ "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null".
+
+ keep_default_na : bool, default True
+ Whether or not to include the default ``NaN`` values when parsing
+ the data. Depending on whether ``na_values`` is passed in, the
+ behavior is as follows:
+
+ * If ``keep_default_na`` is ``True``, and ``na_values`` are
+ specified, ``na_values`` is appended to the default ``NaN`` values
+ used for parsing.
+ * If ``keep_default_na`` is ``True``, and ``na_values`` are not
+ specified, only the default ``NaN`` values are used for parsing.
+ * If ``keep_default_na`` is ``False``, and ``na_values`` are
+ specified, only the ``NaN`` values specified in ``na_values`` are
+ used for parsing.
+ * If ``keep_default_na`` is ``False``, and ``na_values`` are not
+ specified, no strings will be parsed as ``NaN``.
+
+ Note that if ``na_filter`` is passed in as ``False``, the
+ ``keep_default_na`` and ``na_values`` parameters will be ignored.
+
+ na_filter : bool, default True
+ Detect missing value markers (empty strings and the value of
+ ``na_values``). In data without any ``NA`` values, passing
+ ``na_filter=False`` can improve the performance of reading a large
+ file.
+
+ skip_blank_lines : bool, default True
+ If ``True``, skip over blank lines rather than interpreting as
+ ``NaN`` values.
+
+ parse_dates : bool, None, list of Hashable, default None
+ The behavior is as follows:
+
+ * ``bool``. If ``True`` -> try parsing the index.
+ * ``None``. Behaves like ``True`` if ``date_format`` is specified.
+ * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try
+ parsing columns 1, 2, 3 each as a separate date column.
+
+ If a column or index cannot be represented as an array of
+ ``datetime``, say because of an unparsable value or a mixture of
+ timezones, the column or index will be returned unaltered as an
+ ``object`` data type. For non-standard ``datetime`` parsing, use
+ :func:`~pandas.to_datetime` after :func:`~pandas.read_table`.
+
+ Note: A fast-path exists for iso8601-formatted dates.
+
+ date_format : str or dict of column -> format, optional
+ Format to use for parsing dates and/or times when used in conjunction
+ with ``parse_dates``. The strftime to parse time, e.g.
+ :const:`"%d/%m/%Y"`. See `strftime documentation
+ `_ for more information on choices,
+ though note that :const:`"%f"` will parse all the way up to
+ nanoseconds. You can also pass:
+
+ * "ISO8601", to parse any `ISO8601
+ `_ time string (not
+ necessarily in exactly the same format);
+ * "mixed", to infer the format for each element individually. This
+ is risky, and you should probably use it along with `dayfirst`.
+
+ .. versionadded:: 2.0.0
+
+ dayfirst : bool, default False
+ DD/MM format dates, international and European format.
+
+ cache_dates : bool, default True
+ If ``True``, use a cache of unique, converted dates to apply the
+ ``datetime`` conversion. May produce significant speed-up when
+ parsing duplicate date strings, especially ones with timezone
+ offsets.
+
+ iterator : bool, default False
+ Return ``TextFileReader`` object for iteration or getting chunks
+ with ``get_chunk()``.
+
+ chunksize : int, optional
+ Number of lines to read from the file per chunk. Passing a value
+ will cause the function to return a ``TextFileReader`` object for
+ iteration. See the `IO Tools docs
+ `_
+ for more information on ``iterator`` and ``chunksize``.
+
+ compression : str or dict, default 'infer'
+ For on-the-fly decompression of on-disk data. If 'infer' and
+ 'filepath_or_buffer' is path-like, then detect compression from the
+ following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+ '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+ If using 'zip' or 'tar', the ZIP file must contain only one data
+ file to be read in. Set to ``None`` for no decompression.
+ Can also be a dict with key ``'method'`` set to one of
+ {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+ and other key-value pairs are forwarded to ``zipfile.ZipFile``,
+ ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
+ ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
+ As an example, the following could be passed for Zstandard
+ decompression using a custom compression dictionary:
+ ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+ .. versionadded:: 1.5.0
+ Added support for `.tar` files.
+
+ .. versionchanged:: 1.4.0 Zstandard support.
+
+ thousands : str (length 1), optional
+ Character acting as the thousands separator in numerical values.
+
+ decimal : str (length 1), default '.'
+ Character to recognize as decimal point (e.g., use ',' for European
+ data).
+
+ lineterminator : str (length 1), optional
+ Character used to denote a line break. Only valid with C parser.
+
+ quotechar : str (length 1), default '"'
+ Character used to denote the start and end of a quoted item. Quoted
+ items can include the ``delimiter`` and it will be ignored.
+
+ quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL,
+ 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default
+ csv.QUOTE_MINIMAL
+ Control field quoting behavior per ``csv.QUOTE_*`` constants.
+ Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only
+ fields containing special characters are quoted (e.g., characters
+ defined in ``quotechar``, ``delimiter``, or ``lineterminator``).
+
+ doublequote : bool, default True
+ When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
+ indicate whether or not to interpret two consecutive ``quotechar``
+ elements INSIDE a field as a single ``quotechar`` element.
+
+ escapechar : str (length 1), optional
+ Character used to escape other characters.
+
+ comment : str (length 1), optional
+ Character indicating that the remainder of line should not be
+ parsed. If found at the beginning of a line, the line will be
+ ignored altogether. This parameter must be a single character. Like
+ empty lines (as long as ``skip_blank_lines=True``), fully commented
+ lines are ignored by the parameter ``header`` but not by
+ ``skiprows``. For example, if ``comment='#'``, parsing
+ ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in
+ ``'a,b,c'`` being treated as the header.
+
+ encoding : str, optional, default 'utf-8'
+ Encoding to use for UTF when reading/writing (ex. ``'utf-8'``).
+ `List of Python standard encodings
+ `_.
+
+ encoding_errors : str, optional, default 'strict'
+ How encoding errors are treated. `List of possible values
+ `_.
+
+ .. versionadded:: 1.3.0
+
+ dialect : str or csv.Dialect, optional
+ If provided, this parameter will override values (default or not)
+ for the following parameters: ``delimiter``, ``doublequote``,
+ ``escapechar``, ``skipinitialspace``, ``quotechar``, and ``quoting``.
+ If it is necessary to override values, a ``ParserWarning`` will be
+ issued. See ``csv.Dialect`` documentation for more details.
+
+ on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error'
+ Specifies what to do upon encountering a bad line (a line with too
+ many fields). Allowed values are:
+
+ * ``'error'``, raise an Exception when a bad line is encountered.
+ * ``'warn'``, raise a warning when a bad line is encountered and
+ skip that line.
+ * ``'skip'``, skip bad lines without raising or warning when they
+ are encountered.
+ * Callable, function that will process a single bad line.
+
+ * With ``engine='python'``, function with signature
+ ``(bad_line: list[str]) -> list[str] | None``.
+ ``bad_line`` is a list of strings split by the ``sep``.
+ If the function returns ``None``, the bad line will be ignored.
+ If the function returns a new ``list`` of strings with more
+ elements than expected, a ``ParserWarning`` will be emitted
+ while dropping extra elements.
+ * With ``engine='pyarrow'``, function with signature
+ as described in pyarrow documentation: `invalid_row_handler
+ <#pyarrow.csv.ParseOptions.invalid_row_handler>`_.
+
+ .. versionadded:: 1.3.0
+
+ .. versionadded:: 1.4.0
+ Callable
+
+ .. versionchanged:: 2.2.0
+ Callable for ``engine='pyarrow'``
+
+ low_memory : bool, default True
+ Internally process the file in chunks, resulting in lower memory use
+ while parsing, but possibly mixed type inference. To ensure no
+ mixed types either set ``False``, or specify the type with the
+ ``dtype`` parameter. Note that the entire file is read into a single
+ :class:`~pandas.DataFrame` regardless; use the ``chunksize`` or
+ ``iterator`` parameter to return the data in chunks. (Only valid
+ with C parser).
+
+ memory_map : bool, default False
+ If a filepath is provided for ``filepath_or_buffer``, map the file
+ object directly onto memory and access the data directly from there.
+ Using this option can improve performance because there is no longer
+ any I/O overhead.
+
+ float_precision : {'high', 'legacy', 'round_trip'}, optional
+ Specifies which converter the C engine should use for floating-point
+ values. The options are ``None`` or ``'high'`` for the ordinary
+ converter, ``'legacy'`` for the original lower precision pandas
+ converter, and ``'round_trip'`` for the round-trip converter.
+
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection,
+ e.g. host, port, username, password, etc. For HTTP(S) URLs the
+ key-value pairs are forwarded to ``urllib.request.Request`` as
+ header options. For other URLs (e.g. starting with "s3://", and
+ "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
+ Please see ``fsspec`` and ``urllib`` for more details, and for more
+ examples on storage options refer `here
+ `_.
+
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
+ Back-end data type applied to the resultant :class:`DataFrame`
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
+
+ * ``"numpy_nullable"``: returns nullable-dtype-backed
+ :class:`DataFrame`
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
+
+ .. versionadded:: 2.0
+
+ Returns
+ -------
+ DataFrame or TextFileReader
+ A general delimited file is returned as a two-dimensional
+ data structure with labeled axes.
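A small sketch of the ``dtype_backend`` option just described (assuming pandas >= 2.0 and an in-memory buffer):

    import io

    import pandas as pd

    buf = io.StringIO("a\tb\n1\t\n2\t5\n")

    # With the nullable backend, the column containing a missing value
    # stays integer-typed (Int64 with pd.NA) instead of being upcast
    # to float64, which is the default behavior.
    df = pd.read_table(buf, dtype_backend="numpy_nullable")
    print(df.dtypes)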
+
+ See Also
+ --------
+ DataFrame.to_csv : Write DataFrame to a comma-separated values (csv)
+ file.
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
+ read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+ Examples
+ --------
+ >>> pd.read_table("data.txt") # doctest: +SKIP
+ Name Value
+ 0 foo 1
+ 1 bar 2
+ 2 #baz 3
+
+ Index and header can be specified via the `index_col` and `header` arguments.
+
+ >>> pd.read_table("data.txt", header=None) # doctest: +SKIP
+ 0 1
+ 0 Name Value
+ 1 foo 1
+ 2 bar 2
+ 3 #baz 3
+
+ >>> pd.read_table("data.txt", index_col="Value") # doctest: +SKIP
+ Name
+ Value
+ 1 foo
+ 2 bar
+ 3 #baz
+
+ Column types are inferred but can be
+ explicitly specified using the `dtype` argument.
+
+ >>> pd.read_table("data.txt", dtype={"Value": float}) # doctest: +SKIP
+ Name Value
+ 0 foo 1.0
+ 1 bar 2.0
+ 2 #baz 3.0
+
+ True, False, and NA values, and thousands separators have defaults,
+ but can be explicitly specified, too. Supply the values you would like
+ as strings or lists of strings!
+
+ >>> pd.read_table("data.txt", na_values=["foo", "bar"]) # doctest: +SKIP
+ Name Value
+ 0 NaN 1
+ 1 NaN 2
+ 2 #baz 3
+
+ Comment lines in the input file can be skipped using the `comment` argument.
+
+ >>> pd.read_table("data.txt", comment="#") # doctest: +SKIP
+ Name Value
+ 0 foo 1
+ 1 bar 2
+
+ By default, columns with
+ dates will be read as ``object`` rather than ``datetime``.
+
+ >>> df = pd.read_table("tmp.txt") # doctest: +SKIP
+
+ >>> df # doctest: +SKIP
+ col 1 col 2 col 3
+ 0 10 10/04/2018 Sun 15 Jan 2023
+ 1 20 15/04/2018 Fri 12 May 2023
+
+ >>> df.dtypes # doctest: +SKIP
+ col 1 int64
+ col 2 object
+ col 3 object
+ dtype: object
+
+ Specific columns can be parsed as dates by using the `parse_dates` and
+ `date_format` arguments.
+
+ >>> df = pd.read_table(
+ ... "tmp.txt",
+ ... parse_dates=[1, 2],
+ ... date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
+ ... ) # doctest: +SKIP
+
+ >>> df.dtypes # doctest: +SKIP
+ col 1 int64
+ col 2 datetime64[ns]
+ col 3 datetime64[ns]
+ dtype: object
+ """

@overload

From f52c68da59791a24d1686af3d9c0ffd044a71484 Mon Sep 17 00:00:00 2001
From: huhu-dsy
Date: Thu, 16 Oct 2025 21:02:29 +0800
Subject: [PATCH 10/24] Update readers.py

---
 pandas/io/parsers/readers.py | 1099 ++--------------------------------
 1 file changed, 52 insertions(+), 1047 deletions(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 2eb9253d9ad4d..8edb8798349c4 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -12,6 +12,7 @@
 )
 import csv
 import sys
+from textwrap import fill
 from typing import (
 IO,
 TYPE_CHECKING,
@@ -35,6 +36,7 @@
 ParserWarning,
 )
 from pandas.util._decorators import (
+ Appender,
 set_module,
 )
 from pandas.util._exceptions import find_stack_level
@@ -51,6 +53,7 @@
 from pandas import Series
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.api import RangeIndex
+from pandas.core.shared_docs import _shared_docs

 from pandas.io.common import (
 IOHandles,
@@ -758,6 +761,21 @@ def read_csv(
 ) -> DataFrame | TextFileReader: ...
+@Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary="Read a comma-separated values (csv) file into DataFrame.", + see_also_func_name="read_table", + see_also_func_summary="Read general delimited file into DataFrame.", + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), + _default_sep="','", + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] + % "filepath_or_buffer", + ) +) @set_module("pandas") def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -816,534 +834,6 @@ def read_csv( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - r""" - Read a comma-separated values (csv) file into DataFrame. - - Also supports optionally iterating or breaking of the file - into chunks. - - Additional help can be found in the online docs for - `IO Tools `_. - - Parameters - ---------- - - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. - - sep : str, default ',' - Character or regex pattern to treat as the delimiter. If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. - - delimiter : str, optional - Alias for ``sep``. - - header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no - ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named - ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"`` - in the case of MultiIndex columns. 
- - names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. - - index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. - - usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column - labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` - are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. - To instantiate a :class:`~pandas.DataFrame` from - ``data`` with element order - preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the column - names, returning names where the callable function - evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. - - dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - versionadded:: 1.5.0 - Support for ``defaultdict`` was added. - Specify a ``defaultdict`` as input - where - the default determines the ``dtype`` of the columns - which are not explicitly - listed. - - engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, - while the python engine - is currently more feature-complete. - Multithreading is currently only supported by - the pyarrow engine. - - versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, - and some features - are unsupported, or may not work correctly, with this engine. - - converters : dict of {Hashable : Callable}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. - - true_values : list, optional - Values to consider as ``True`` - in addition to case-insensitive variants of 'True'. - - false_values : list, optional - Values to consider as ``False`` - in addition to case-insensitive variants of 'False'. - - skipinitialspace : bool, default False - Skip spaces after delimiter. 
- - skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row - should be skipped and ``False`` otherwise. - An example of a valid callable argument would be - ``lambda x: x in [0, 2]``. - - skipfooter : int, default 0 - Number of lines at bottom of file to - skip (Unsupported with ``engine='c'``). - - nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. - Refers to the number of data rows in the returned DataFrame, excluding: - - * The header row containing column names. - * Rows before the header row, if ``header=1`` or larger. - - Example usage: - - * To read the first 999,999 (non-header) rows: - ``read_csv(..., nrows=999999)`` - - * To read rows 1,000,000 through 1,999,999: - ``read_csv(..., skiprows=1000000, nrows=999999)`` - - na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, - optional - Additional strings to recognize as - ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default - the following values are interpreted as - ``NaN``: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", - "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", - "None", "n/a", "nan", "null". - - keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when - parsing the data. - Depending on whether ``na_values`` is - passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, - ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and - ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False`` - , and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` - are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, - the ``keep_default_na`` and - ``na_values`` parameters will be ignored. - - na_filter : bool, default True - Detect missing value markers (empty strings and - the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` - can improve the - performance of reading a large file. - - skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than - interpreting as ``NaN`` values. - - parse_dates : bool, None, list of Hashable, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing - columns 1, 2, 3 - each as a separate date column. - - If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_csv`. - - Note: A fast-path exists for iso8601-formatted dates. - - date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times - when used in conjunction with ``parse_dates``. 
- The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any - `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element - individually. This is risky, - and you should probably use it along with `dayfirst`. - - versionadded:: 2.0.0 - - dayfirst : bool, default False - DD/MM format dates, international and European format. - - cache_dates : bool, default True - If ``True``, use a cache of unique, converted - dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - - iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. - - chunksize : int, optional - Number of lines to read from the file - per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - - compression : str or dict, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and - 'filepath_or_buffer' is - path-like, then detect compression from the following extensions: '.gz', - '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or ' - .tar.bz2' (otherwise no compression). - If using 'zip' or 'tar', the ZIP file must contain only one data - file to be read in. - Set to ``None`` for no decompression. - Can also be a dict with key ``'method'`` set to one of {``'zip'``, - ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and other - key-value pairs are forwarded to ``zipfile.ZipFile``, - ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, - ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. - As an example, the following could be passed for Zstandard - decompression using a custom compression dictionary: - ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``. - - versionadded:: 1.5.0 - Added support for `.tar` files. - - versionchanged:: 1.4.0 Zstandard support. - - thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. - - decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European - data). - - lineterminator : str (length 1), optional - Character used to denote a line break. Only valid with C parser. - - quotechar : str (length 1), optional - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. - - quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or - csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default - csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. - Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only - fields containing special characters are quoted (e.g., characters - defined in ``quotechar``, ``delimiter``, or ``lineterminator``. - - doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not - ``QUOTE_NONE``, indicate whether or not to interpret two - consecutive ``quotechar`` elements INSIDE a field as a single - ``quotechar`` element. - - escapechar : str (length 1), optional - Character used to escape other characters. 
- - comment : str (length 1), optional - Character indicating that the remainder of line should not be - parsed. If found at the beginning of a line, the line will be - ignored altogether. This parameter must be a single character. - Like empty lines (as long as ``skip_blank_lines=True``), fully - commented lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in - ``'a,b,c'`` being treated as the header. - - encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). - `List of Python standard encodings - `_. - - encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_. - - versionadded:: 1.3.0 - - dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) - for the following parameters: ``delimiter``, ``doublequote``, - ``escapechar``, ``skipinitialspace``, ``quotechar``, and - ``quoting``. If it is necessary to override values, a - ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. - - on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too - many fields). Allowed values are: - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and - skip that line. - - ``'skip'``, skip bad lines without raising or warning when they - are encountered. - - Callable, function that will process a single bad line. - - With ``engine='python'``, function with signature - ``(bad_line: list[str]) -> list[str] | None``. - ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be - ignored. - If the function returns a new ``list`` of strings with more - elements than expected, a ``ParserWarning`` will be emitted - while dropping extra elements. - - With ``engine='pyarrow'``, function with signature as - described in pyarrow documentation: `invalid_row_handler - #pyarrow.csv.ParseOptions.invalid_row_handler>`_. - - versionadded:: 1.3.0 - - versionadded:: 1.4.0 - Callable - - versionchanged:: 2.2.0 - Callable for ``engine='pyarrow'`` - - low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory - use while parsing, but possibly mixed type inference. To ensure - no mixed types either set ``False``, or specify the type with the - ``dtype`` parameter. Note that the entire file is read into a - single :class:`~pandas.DataFrame` regardless, use the - ``chunksize`` or ``iterator`` parameter to return the data in - chunks. (Only valid with C parser). - - memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file - object directly onto memory and access the data directly from - there. Using this option can improve performance because there is - no longer any I/O overhead. - - float_precision : {'high', 'legacy', 'round_trip'}, optional - Specifies which converter the C engine should use for - floating-point values. The options are ``None`` or ``'high'`` for - the ordinary converter, ``'legacy'`` for the original lower - precision pandas converter, and ``'round_trip'`` for the - round-trip converter. - - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. 
host, port, username, password, etc. For HTTP(S) URLs the - key-value pairs are forwarded to ``urllib.request.Request`` as - header options. For other URLs (e.g. starting with "s3://", and - "gcs://") the key-value pairs are forwarded to ``fsspec.open``. - Please see ``fsspec`` and ``urllib`` for more details, and for - more examples on storage options refer `here - `_. - - dtype_backend : {'numpy_nullable', 'pyarrow'} - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior is - to not use nullable data types. If specified, the behavior is as - follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed - :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable - :class:`ArrowDtype` :class:`DataFrame` - - versionadded:: 2.0 - - Returns - ------- - - DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - - See Also - -------- - DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) - file. - read_table : Read general delimited file into DataFrame. - read_fwf : Read a table of fixed-width formatted lines into DataFrame. - - Examples - -------- - >>> pd.read_csv("data.csv") # doctest: +SKIP - Name Value - 0 foo 1 - 1 bar 2 - 2 #baz 3 - - Index and header can be specified via the `index_col` and `header` - arguments. - - >>> pd.read_csv("data.csv", header=None) # doctest: +SKIP - 0 1 - 0 Name Value - 1 foo 1 - 2 bar 2 - 3 #baz 3 - - >>> pd.read_csv("data.csv", index_col="Value") # doctest: +SKIP - Name - Value - 1 foo - 2 bar - 3 #baz - - Column types are inferred but can be explicitly specified using the - `dtype` argument. - - >>> pd.read_csv("data.csv", dtype={"Value": float}) # doctest: +SKIP - Name Value - 0 foo 1.0 - 1 bar 2.0 - 2 #baz 3.0 - - True, False, and NA values, and thousands separators have defaults, - but can be explicitly specified, too. Supply the values you would like - as strings or lists of strings! - - >>> pd.read_csv("data.csv", na_values=["foo", "bar"]) # doctest: +SKIP - Name Value - 0 NaN 1 - 1 NaN 2 - 2 #baz 3 - - Comment lines in the input file can be skipped using the `comment` - argument. - - >>> pd.read_csv("data.csv", comment="#") # doctest: +SKIP - Name Value - 0 foo 1 - 1 bar 2 - - By default, columns with dates will be read as ``object`` rather than - ``datetime``. - - >>> df = pd.read_csv("tmp.csv") # doctest: +SKIP - - >>> df # doctest: +SKIP - col 1 col 2 col 3 - 0 10 10/04/2018 Sun 15 Jan 2023 - 1 20 15/04/2018 Fri 12 May 2023 - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 object - col 3 object - dtype: object - - Specific columns can be parsed as dates by using the `parse_dates` and - `date_format` arguments. - - >>> df = pd.read_csv( - "tmp.csv", - parse_dates=[1, 2], - date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}, - ) # doctest: +SKIP - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 datetime64[ns] - col 3 datetime64[ns] - dtype: object - """ # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1404,6 +894,23 @@ def read_table( ) -> DataFrame | TextFileReader: ... +@Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + see_also_func_name="read_csv", + see_also_func_summary=( + "Read a comma-separated values (csv) file into DataFrame." 
+ ), + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), + _default_sep=r"'\\t' (tab-stop)", + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] + % "filepath_or_buffer", + ) +) @set_module("pandas") def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -1462,526 +969,24 @@ def read_table( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - """ - Read general delimited file into DataFrame. - - Also supports optionally iterating or breaking of the file - into chunks. - - Additional help can be found in the online docs for - `IO Tools `_. - - Parameters - ---------- - - filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. - - sep : str, default '\\t' (tab-stop) - Character or regex pattern to treat as the delimiter. If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - - In addition, separators longer than 1 character and different from - ``'\\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. - - delimiter : str, optional - Alias for ``sep``. - - header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: - if no ``names`` are passed the behavior is identical to ``header=0`` - and column names are inferred from the first line of the file, if - column names are passed explicitly to ``names`` then the behavior is - identical to ``header=None``. Explicitly pass ``header=0`` to be - able to replace existing names. The header can be a list of integers - that specify row locations for a :class:`~pandas.MultiIndex` on the - columns e.g. ``[0, 1, 3]``. Intervening rows that are not specified - will be skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the - form ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {i}"`` or - ``"Unnamed: {i}_level_{level}"`` in the case of MultiIndex columns. - - names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column - names. Duplicates in this list are not allowed. 
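As a quick, runnable illustration of the ``header``/``names`` interplay documented above (the inline data is invented for this sketch and is not part of the patch):

    import io
    import pandas as pd

    data = io.StringIO("Name\tValue\nfoo\t1\nbar\t2\n")
    # header=0 consumes the file's own header row while ``names`` relabels it.
    df = pd.read_table(data, header=0, names=["name", "value"])
    print(df.columns.tolist())  # ['name', 'value']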
- - index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or - column indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use - the first column as the index, e.g., when you have a malformed file - with delimiters at the end of each line. - - usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or - column indices. If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or - strings that correspond to column names provided either by the user - in ``names`` or inferred from the document header row(s). If - ``names`` are given, the document header row(s) are not taken into - account. For example, a valid list-like ``usecols`` parameter would - be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is - ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To - instantiate a :class:`~pandas.DataFrame` from ``data`` with element - order preserved use - ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the - column names, returning names where the callable function evaluates - to ``True``. An example of a valid callable argument would be - ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this - parameter results in much faster parsing time and lower memory usage. - - dtype : dtype or dict of {Hashable : dtype}, optional - Data type(s) to apply to either the whole dataset or individual - columns. E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}`` - Use ``str`` or ``object`` together with suitable ``na_values`` - settings to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` - as input where the default determines the ``dtype`` of the - columns which are not explicitly listed. - - engine : {'c', 'python', 'pyarrow'}, optional - Parser engine to use. The C and pyarrow engines are faster, while - the python engine is currently more feature-complete. Multithreading - is currently only supported by the pyarrow engine. - - versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and - some features are unsupported, or may not work correctly, with - this engine. - - converters : dict of {Hashable : Callable}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. - - true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive - variants of 'True'. - - false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive - variants of 'False'. - - skipinitialspace : bool, default False - Skip spaces after delimiter. - - skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` - otherwise. 
-        An example of a valid callable argument would be
-        ``lambda x: x in [0, 2]``.
-
-    skipfooter : int, default 0
-        Number of lines at bottom of file to skip (Unsupported with
-        ``engine='c'``).
-
-    nrows : int, optional
-        Number of rows of file to read. Useful for reading pieces of large
-        files. Refers to the number of data rows in the returned DataFrame,
-        excluding:
-
-        * The header row containing column names.
-        * Rows before the header row, if ``header=1`` or larger.
-
-        Example usage:
-
-        * To read the first 999,999 (non-header) rows:
-          ``read_table(..., nrows=999999)``
-
-        * To read rows 1,000,000 through 1,999,999:
-          ``read_table(..., skiprows=1000000, nrows=999999)``
-
-    na_values : Hashable, Iterable of Hashable or dict of {Hashable :
-        Iterable}, optional
-        Additional strings to recognize as ``NA``/``NaN``. If ``dict``
-        passed, specific per-column ``NA`` values. By default the following
-        values are interpreted as ``NaN``: "", "#N/A", "#N/A N/A", "#NA",
-        "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>",
-        "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null".
-
-    keep_default_na : bool, default True
-        Whether or not to include the default ``NaN`` values when parsing
-        the data. Depending on whether ``na_values`` is passed in, the
-        behavior is as follows:
-
-        * If ``keep_default_na`` is ``True``, and ``na_values`` are
-          specified, ``na_values`` is appended to the default ``NaN`` values
-          used for parsing.
-        * If ``keep_default_na`` is ``True``, and ``na_values`` are not
-          specified, only the default ``NaN`` values are used for parsing.
-        * If ``keep_default_na`` is ``False``, and ``na_values`` are
-          specified, only the ``NaN`` values specified ``na_values`` are
-          used for parsing.
-        * If ``keep_default_na`` is ``False``, and ``na_values`` are not
-          specified, no strings will be parsed as ``NaN``.
-
-        Note that if ``na_filter`` is passed in as ``False``, the
-        ``keep_default_na`` and ``na_values`` parameters will be ignored.
-
-    na_filter : bool, default True
-        Detect missing value markers (empty strings and the value of
-        ``na_values``). In data without any ``NA`` values, passing
-        ``na_filter=False`` can improve the performance of reading a large
-        file.
-
-    skip_blank_lines : bool, default True
-        If ``True``, skip over blank lines rather than interpreting as
-        ``NaN`` values.
-
-    parse_dates : bool, None, list of Hashable, default None
-        The behavior is as follows:
-
-        * ``bool``. If ``True`` -> try parsing the index.
-        * ``None``. Behaves like ``True`` if ``date_format`` is specified.
-        * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try
-          parsing columns 1, 2, 3 each as a separate date column.
-
-        If a column or index cannot be represented as an array of
-        ``datetime``, say because of an unparsable value or a mixture of
-        timezones, the column or index will be returned unaltered as an
-        ``object`` data type. For non-standard ``datetime`` parsing, use
-        :func:`~pandas.to_datetime` after :func:`~pandas.read_table`.
-
-        Note: A fast-path exists for iso8601-formatted dates.
-
-    date_format : str or dict of column -> format, optional
-        Format to use for parsing dates and/or times when used in conjunction
-        with ``parse_dates``. The strftime to parse time, e.g.
-        :const:`"%d/%m/%Y"`. See `strftime documentation
-        <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_
-        for more information on choices,
-        though note that :const:`"%f"` will parse all the way up to
-        nanoseconds. You can also pass:
-
-        * "ISO8601", to parse any `ISO8601
-          <https://en.wikipedia.org/wiki/ISO_8601>`_ time string (not
-          necessarily in exactly the same format);
-        * "mixed", to infer the format for each element individually. This
-          is risky, and you should probably use it along with `dayfirst`.
-
-        .. versionadded:: 2.0.0
-
-    dayfirst : bool, default False
-        DD/MM format dates, international and European format.
-
-    cache_dates : bool, default True
-        If ``True``, use a cache of unique, converted dates to apply the
-        ``datetime`` conversion. May produce significant speed-up when
-        parsing duplicate date strings, especially ones with timezone
-        offsets.
-
-    iterator : bool, default False
-        Return ``TextFileReader`` object for iteration or getting chunks
-        with ``get_chunk()``.
-
-    chunksize : int, optional
-        Number of lines to read from the file per chunk. Passing a value
-        will cause the function to return a ``TextFileReader`` object for
-        iteration. See the `IO Tools docs
-        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-chunking>`_
-        for more information on ``iterator`` and ``chunksize``.
-
-    compression : str or dict, default 'infer'
-        For on-the-fly decompression of on-disk data. If 'infer' and
-        'filepath_or_buffer' is path-like, then detect compression from the
-        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
-        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
-        If using 'zip' or 'tar', the ZIP file must contain only one data
-        file to be read in. Set to ``None`` for no decompression.
-        Can also be a dict with key ``'method'`` set to one of
-        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
-        and other key-value pairs are forwarded to ``zipfile.ZipFile``,
-        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
-        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
-        As an example, the following could be passed for Zstandard
-        decompression using a custom compression dictionary:
-        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
-
-        .. versionadded:: 1.5.0
-            Added support for `.tar` files.
-
-        .. versionchanged:: 1.4.0
-            Zstandard support.
-
-    thousands : str (length 1), optional
-        Character acting as the thousands separator in numerical values.
-
-    decimal : str (length 1), default '.'
-        Character to recognize as decimal point (e.g., use ',' for European
-        data).
-
-    lineterminator : str (length 1), optional
-        Character used to denote a line break. Only valid with C parser.
-
-    quotechar : str (length 1), default '"'
-        Character used to denote the start and end of a quoted item. Quoted
-        items can include the ``delimiter`` and it will be ignored.
-
-    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL,
-        2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default
-        csv.QUOTE_MINIMAL
-        Control field quoting behavior per ``csv.QUOTE_*`` constants.
-        Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only
-        fields containing special characters are quoted (e.g., characters
-        defined in ``quotechar``, ``delimiter``, or ``lineterminator``).
-
-    doublequote : bool, default True
-        When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
-        indicate whether or not to interpret two consecutive ``quotechar``
-        elements INSIDE a field as a single ``quotechar`` element.
-
-    escapechar : str (length 1), optional
-        Character used to escape other characters.
-
-    comment : str (length 1), optional
-        Character indicating that the remainder of line should not be
-        parsed. If found at the beginning of a line, the line will be
-        ignored altogether. This parameter must be a single character. Like
-        empty lines (as long as ``skip_blank_lines=True``), fully commented
-        lines are ignored by the parameter ``header`` but not by
-        ``skiprows``. For example, if ``comment='#'``, parsing
-        ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in
-        ``'a,b,c'`` being treated as the header.
-
-    encoding : str, optional, default 'utf-8'
-        Encoding to use for UTF when reading/writing (ex. ``'utf-8'``).
-        `List of Python standard encodings
-        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
-
-    encoding_errors : str, optional, default 'strict'
-        How encoding errors are treated. `List of possible values
-        <https://docs.python.org/3/library/codecs.html#error-handlers>`_.
-
-        .. versionadded:: 1.3.0
-
-    dialect : str or csv.Dialect, optional
-        If provided, this parameter will override values (default or not)
-        for the following parameters: ``delimiter``, ``doublequote``,
-        ``escapechar``, ``skipinitialspace``, ``quotechar``, and ``quoting``.
-        If it is necessary to override values, a ``ParserWarning`` will be
-        issued. See ``csv.Dialect`` documentation for more details.
-
-    on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error'
-        Specifies what to do upon encountering a bad line (a line with too
-        many fields). Allowed values are:
-
-        * ``'error'``, raise an Exception when a bad line is encountered.
-        * ``'warn'``, raise a warning when a bad line is encountered and
-          skip that line.
-        * ``'skip'``, skip bad lines without raising or warning when they
-          are encountered.
-        * Callable, function that will process a single bad line.
-
-          * With ``engine='python'``, function with signature
-            ``(bad_line: list[str]) -> list[str] | None``.
-            ``bad_line`` is a list of strings split by the ``sep``.
-            If the function returns ``None``, the bad line will be ignored.
-            If the function returns a new ``list`` of strings with more
-            elements than expected, a ``ParserWarning`` will be emitted
-            while dropping extra elements.
-          * With ``engine='pyarrow'``, function with signature
-            as described in pyarrow documentation: `invalid_row_handler
-            <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_.
-
-        .. versionadded:: 1.3.0
-
-        .. versionadded:: 1.4.0
-            Callable
-
-        .. versionchanged:: 2.2.0
-            Callable for ``engine='pyarrow'``
-
-    low_memory : bool, default True
-        Internally process the file in chunks, resulting in lower memory
-        use while parsing, but possibly mixed type inference. To ensure
-        no mixed types either set ``False``, or specify the type with the
-        ``dtype`` parameter. Note that the entire file is read into a
-        single :class:`~pandas.DataFrame` regardless, use the
-        ``chunksize`` or ``iterator`` parameter to return the data in
-        chunks. (Only valid with C parser).
-
-    memory_map : bool, default False
-        If a filepath is provided for ``filepath_or_buffer``, map the file
-        object directly onto memory and access the data directly from
-        there. Using this option can improve performance because there is
-        no longer any I/O overhead.
-
-    float_precision : {'high', 'legacy', 'round_trip'}, optional
-        Specifies which converter the C engine should use for
-        floating-point values. The options are ``None`` or ``'high'`` for
-        the ordinary converter, ``'legacy'`` for the original lower
-        precision pandas converter, and ``'round_trip'`` for the
-        round-trip converter.
-
-    storage_options : dict, optional
-        Extra options that make sense for a particular storage connection,
-        e.g. host, port, username, password, etc. For HTTP(S) URLs the
-        key-value pairs are forwarded to ``urllib.request.Request`` as
-        header options. For other URLs (e.g. starting with "s3://", and
-        "gcs://") the key-value pairs are forwarded to ``fsspec.open``.
-        Please see ``fsspec`` and ``urllib`` for more details, and for more
-        examples on storage options refer `here
-        <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_.
-
-    dtype_backend : {'numpy_nullable', 'pyarrow'}
-        Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). If not specified, the default behavior
-        is to not use nullable data types. If specified, the behavior
-        is as follows:
-
-        * ``"numpy_nullable"``: returns nullable-dtype-backed
-          :class:`DataFrame`
-        * ``"pyarrow"``: returns pyarrow-backed nullable
-          :class:`ArrowDtype` :class:`DataFrame`
-
-        .. versionadded:: 2.0
-
-    Returns
-    -------
-    DataFrame or TextFileReader
-        A general delimited file is returned as a two-dimensional
-        data structure with labeled axes.
+    # locals() should never be modified
+    kwds = locals().copy()
+    del kwds["filepath_or_buffer"]
+    del kwds["sep"]
 
-    See Also
-    --------
-    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv)
-        file.
-    read_csv : Read a comma-separated values (csv) file into DataFrame.
-    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+    kwds_defaults = _refine_defaults_read(
+        dialect,
+        delimiter,
+        engine,
+        sep,
+        on_bad_lines,
+        names,
+        defaults={"delimiter": "\t"},
+        dtype_backend=dtype_backend,
+    )
+    kwds.update(kwds_defaults)
 
-    Examples
-    --------
-    >>> pd.read_table("data.txt")  # doctest: +SKIP
-       Name  Value
-    0   foo      1
-    1   bar      2
-    2  #baz      3
-
-    Index and header can be specified via the `index_col` and `header` arguments.
-
-    >>> pd.read_table("data.txt", header=None)  # doctest: +SKIP
-          0      1
-    0  Name  Value
-    1   foo      1
-    2   bar      2
-    3  #baz      3
-
-    >>> pd.read_table("data.txt", index_col="Value")  # doctest: +SKIP
-           Name
-    Value
-    1       foo
-    2       bar
-    3      #baz
-
-    Column types are inferred but can be
-    explicitly specified using the dtype argument.
-
-    >>> pd.read_table("data.txt", dtype={"Value": float})  # doctest: +SKIP
-       Name  Value
-    0   foo    1.0
-    1   bar    2.0
-    2  #baz    3.0
-
-    True, False, and NA values, and thousands separators have defaults,
-    but can be explicitly specified, too. Supply the values you would like
-    as strings or lists of strings!
-
-    >>> pd.read_table("data.txt", na_values=["foo", "bar"])  # doctest: +SKIP
-       Name  Value
-    0   NaN      1
-    1   NaN      2
-    2  #baz      3
-
-    Comment lines in the input file can be skipped using the `comment` argument.
-
-    >>> pd.read_table("data.txt", comment="#")  # doctest: +SKIP
-      Name  Value
-    0  foo      1
-    1  bar      2
-
-    By default, columns with
-    dates will be read as ``object`` rather than ``datetime``.
-
-    >>> df = pd.read_table("tmp.txt")  # doctest: +SKIP
-
-    >>> df  # doctest: +SKIP
-       col 1      col 2            col 3
-    0     10 10/04/2018  Sun 15 Jan 2023
-    1     20 15/04/2018  Fri 12 May 2023
-
-    >>> df.dtypes  # doctest: +SKIP
-    col 1     int64
-    col 2    object
-    col 3    object
-    dtype: object
-
-    Specific columns can be parsed as dates by using the `parse_dates` and
-    `date_format` arguments.
-
-    >>> df = pd.read_table(
-    ...     "tmp.txt",
-    ...     parse_dates=[1, 2],
-    ...     date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
-    ...
) # doctest: +SKIP - - >>> df.dtypes # doctest: +SKIP - col 1 int64 - col 2 datetime64[ns] - col 3 datetime64[ns] - dtype: object - """ + return _read(filepath_or_buffer, kwds) @overload @@ -2934,4 +1939,4 @@ def _validate_skipfooter(kwds: dict[str, Any]) -> None: if kwds.get("iterator") or kwds.get("chunksize"): raise ValueError("'skipfooter' not supported for iteration") if kwds.get("nrows"): - raise ValueError("'skipfooter' not supported with 'nrows'") + raise ValueError("'skipfooter' not supported with 'nrows'") \ No newline at end of file From 67da165ecd5f85f9d5e02bf13cd271fda8f98bbe Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Thu, 16 Oct 2025 22:44:20 +0800 Subject: [PATCH 11/24] refactor: replace @Appender with hardcoded docstrings for read_csv and read_table --- pandas/io/parsers/readers.py | 905 +++++++++++++++++++++++++++++++++-- 1 file changed, 869 insertions(+), 36 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8edb8798349c4..5dda6dddf15ec 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -12,7 +12,6 @@ ) import csv import sys -from textwrap import fill from typing import ( IO, TYPE_CHECKING, @@ -36,7 +35,6 @@ ParserWarning, ) from pandas.util._decorators import ( - Appender, set_module, ) from pandas.util._exceptions import find_stack_level @@ -53,7 +51,6 @@ from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex -from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( IOHandles, @@ -761,21 +758,6 @@ def read_csv( ) -> DataFrame | TextFileReader: ... -@Appender( - _doc_read_csv_and_table.format( - func_name="read_csv", - summary="Read a comma-separated values (csv) file into DataFrame.", - see_also_func_name="read_table", - see_also_func_summary="Read general delimited file into DataFrame.", - na_values_str=fill( - '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " - ), - _default_sep="','", - storage_options=_shared_docs["storage_options"], - decompression_options=_shared_docs["decompression_options"] - % "filepath_or_buffer", - ) -) @set_module("pandas") def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -834,6 +816,440 @@ def read_csv( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: + """ + Read a comma-separated values (csv) file into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the online docs for + `IO Tools `_. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. + sep : str, default ',' + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. 
+        In addition, separators longer than 1 character and different from
+        ``'\\s+'`` will be interpreted as regular expressions and will also force
+        the use of the Python parsing engine. Note that regex delimiters are prone
+        to ignoring quoted data. Regex example: ``'\r\t'``.
+    delimiter : str, optional
+        Alias for ``sep``.
+    header : int, Sequence of int, 'infer' or None, default 'infer'
+        Row number(s) containing column labels and marking the start of the
+        data (zero-indexed). Default behavior is to infer the column names: if no ``names``
+        are passed the behavior is identical to ``header=0`` and column
+        names are inferred from the first line of the file, if column
+        names are passed explicitly to ``names`` then the behavior is identical to
+        ``header=None``. Explicitly pass ``header=0`` to be able to
+        replace existing names. The header can be a list of integers that
+        specify row locations for a :class:`~pandas.MultiIndex` on the columns
+        e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be
+        skipped (e.g. 2 in this example is skipped). Note that this
+        parameter ignores commented lines and empty lines if
+        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
+        data rather than the first line of the file.
+
+        When inferred from the file contents, headers are kept distinct from
+        each other by renaming duplicate names with a numeric suffix of the form
+        ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``.
+        Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"``
+        in the case of MultiIndex columns.
+    names : Sequence of Hashable, optional
+        Sequence of column labels to apply. If the file contains a header row,
+        then you should explicitly pass ``header=0`` to override the column names.
+        Duplicates in this list are not allowed.
+    index_col : Hashable, Sequence of Hashable or False, optional
+        Column(s) to use as row label(s), denoted either by column labels or column
+        indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
+        will be formed for the row labels.
+
+        Note: ``index_col=False`` can be used to force pandas to *not* use the first
+        column as the index, e.g., when you have a malformed file with delimiters at
+        the end of each line.
+    usecols : Sequence of Hashable or Callable, optional
+        Subset of columns to select, denoted either by column labels or column indices.
+        If list-like, all elements must either
+        be positional (i.e. integer indices into the document columns) or strings
+        that correspond to column names provided either by the user in ``names`` or
+        inferred from the document header row(s). If ``names`` are given, the document
+        header row(s) are not taken into account. For example, a valid list-like
+        ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
+        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
+        To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order
+        preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
+        for columns in ``['foo', 'bar']`` order or
+        ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
+        for ``['bar', 'foo']`` order.
+
+        If callable, the callable function will be evaluated against the column
+        names, returning names where the callable function evaluates to ``True``. An
+        example of a valid callable argument would be ``lambda x: x.upper() in
+        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
+        parsing time and lower memory usage.
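A quick, runnable sketch of the callable form of ``usecols`` described above; the inline data is invented for illustration and is not part of the patch:

    import io
    import pandas as pd

    data = io.StringIO("foo,bar,baz\n1,2,3\n4,5,6\n")
    # Keep only the columns whose upper-cased name is in the allowed set.
    df = pd.read_csv(data, usecols=lambda name: name.upper() in {"FOO", "BAZ"})
    print(df.columns.tolist())  # ['foo', 'baz']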
+    dtype : dtype or dict of {Hashable : dtype}, optional
+        Data type(s) to apply to either the whole dataset or individual columns.
+        E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}``
+        Use ``str`` or ``object`` together with suitable ``na_values`` settings
+        to preserve and not interpret ``dtype``.
+        If ``converters`` are specified, they will be applied INSTEAD
+        of ``dtype`` conversion.
+
+        .. versionadded:: 1.5.0
+
+            Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where
+            the default determines the ``dtype`` of the columns which are not explicitly
+            listed.
+    engine : {'c', 'python', 'pyarrow'}, optional
+        Parser engine to use. The C and pyarrow engines are faster, while the python engine
+        is currently more feature-complete. Multithreading is currently only supported by
+        the pyarrow engine.
+
+        .. versionadded:: 1.4.0
+
+            The 'pyarrow' engine was added as an *experimental* engine, and some features
+            are unsupported, or may not work correctly, with this engine.
+    converters : dict of {Hashable : Callable}, optional
+        Functions for converting values in specified columns. Keys can either
+        be column labels or column indices.
+    true_values : list, optional
+        Values to consider as ``True`` in addition to case-insensitive variants of 'True'.
+    false_values : list, optional
+        Values to consider as ``False`` in addition to case-insensitive variants of 'False'.
+    skipinitialspace : bool, default False
+        Skip spaces after delimiter.
+    skiprows : int, list of int or Callable, optional
+        Line numbers to skip (0-indexed) or number of lines to skip (``int``)
+        at the start of the file.
+
+        If callable, the callable function will be evaluated against the row
+        indices, returning ``True`` if the row should be skipped and ``False`` otherwise.
+        An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
+    skipfooter : int, default 0
+        Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
+    nrows : int, optional
+        Number of rows of file to read. Useful for reading pieces of large files.
+        Refers to the number of data rows in the returned DataFrame, excluding:
+
+        * The header row containing column names.
+        * Rows before the header row, if ``header=1`` or larger.
+
+        Example usage:
+
+        * To read the first 999,999 (non-header) rows:
+          ``read_csv(..., nrows=999999)``
+
+        * To read rows 1,000,000 through 1,999,999:
+          ``read_csv(..., skiprows=1000000, nrows=999999)``
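A small sketch of the ``skiprows``/``nrows`` windowing just described; the inline data is made up for illustration:

    import io
    import pandas as pd

    data = io.StringIO("a,b\n1,2\n3,4\n5,6\n7,8\n")
    # Keep the header (row 0), skip the first two data rows, then read two rows.
    df = pd.read_csv(data, skiprows=range(1, 3), nrows=2)
    print(df.to_dict("list"))  # {'a': [5, 7], 'b': [6, 8]}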
+    na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional
+        Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
+        per-column ``NA`` values. By default the following values are interpreted as
+        ``NaN``: empty string, "NaN", "N/A", "NULL", and other common representations of missing data.
+    keep_default_na : bool, default True
+        Whether or not to include the default ``NaN`` values when parsing the data.
+        Depending on whether ``na_values`` is passed in, the behavior is as follows:
+
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values``
+          is appended to the default ``NaN`` values used for parsing.
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only
+          the default ``NaN`` values are used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only
+          the ``NaN`` values specified ``na_values`` are used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no
+          strings will be parsed as ``NaN``.
+
+        Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and
+        ``na_values`` parameters will be ignored.
+    na_filter : bool, default True
+        Detect missing value markers (empty strings and the value of ``na_values``). In
+        data without any ``NA`` values, passing ``na_filter=False`` can improve the
+        performance of reading a large file.
+    skip_blank_lines : bool, default True
+        If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
+    parse_dates : bool, None, list of Hashable, default None
+        The behavior is as follows:
+
+        * ``bool``. If ``True`` -> try parsing the index.
+        * ``None``. Behaves like ``True`` if ``date_format`` is specified.
+        * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
+          each as a separate date column.
+
+        If a column or index cannot be represented as an array of ``datetime``,
+        say because of an unparsable value or a mixture of timezones, the column
+        or index will be returned unaltered as an ``object`` data type. For
+        non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
+        :func:`~pandas.read_csv`.
+
+        Note: A fast-path exists for iso8601-formatted dates.
+    date_format : str or dict of column -> format, optional
+        Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``.
+        The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
+        `strftime documentation
+        <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_
+        for more information on choices, though
+        note that :const:`"%f"` will parse all the way up to nanoseconds.
+        You can also pass:
+
+        - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
+          time string (not necessarily in exactly the same format);
+        - "mixed", to infer the format for each element individually. This is risky,
+          and you should probably use it along with `dayfirst`.
+
+        .. versionadded:: 2.0.0
+    dayfirst : bool, default False
+        DD/MM format dates, international and European format.
+    cache_dates : bool, default True
+        If ``True``, use a cache of unique, converted dates to apply the ``datetime``
+        conversion. May produce significant speed-up when parsing duplicate
+        date strings, especially ones with timezone offsets.
+
+    iterator : bool, default False
+        Return ``TextFileReader`` object for iteration or getting chunks with
+        ``get_chunk()``.
+    chunksize : int, optional
+        Number of lines to read from the file per chunk. Passing a value will cause the
+        function to return a ``TextFileReader`` object for iteration.
+        See the `IO Tools docs
+        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-chunking>`_
+        for more information on ``iterator`` and ``chunksize``.
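A minimal sketch of chunked reading with ``chunksize``; the inline data is invented for illustration:

    import io
    import pandas as pd

    data = io.StringIO("a,b\n1,2\n3,4\n5,6\n")
    # chunksize returns a TextFileReader that yields DataFrames lazily.
    with pd.read_csv(data, chunksize=2) as reader:
        for chunk in reader:
            print(len(chunk))  # 2, then 1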
+
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
+        path-like, then detect compression from the following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
+        Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+
+    thousands : str (length 1), optional
+        Character acting as the thousands separator in numerical values.
+    decimal : str (length 1), default '.'
+        Character to recognize as decimal point (e.g., use ',' for European data).
+    lineterminator : str (length 1), optional
+        Character used to denote a line break. Only valid with C parser.
+    quotechar : str (length 1), optional
+        Character used to denote the start and end of a quoted item. Quoted
+        items can include the ``delimiter`` and it will be ignored.
+    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL
+        Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
+        ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
+        characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
+        or ``lineterminator``).
+    doublequote : bool, default True
+        When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate
+        whether or not to interpret two consecutive ``quotechar`` elements INSIDE a
+        field as a single ``quotechar`` element.
+    escapechar : str (length 1), optional
+        Character used to escape other characters.
+    comment : str (length 1), optional
+        Character indicating that the remainder of line should not be parsed.
+        If found at the beginning
+        of a line, the line will be ignored altogether. This parameter must be a
+        single character. Like empty lines (as long as ``skip_blank_lines=True``),
+        fully commented lines are ignored by the parameter ``header`` but not by
+        ``skiprows``. For example, if ``comment='#'``, parsing
+        ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being
+        treated as the header.
+    encoding : str, optional, default 'utf-8'
+        Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python
+        standard encodings
+        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
+
+    encoding_errors : str, optional, default 'strict'
+        How encoding errors are treated. `List of possible values
+        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
+
+        .. versionadded:: 1.3.0
+
+    dialect : str or csv.Dialect, optional
+        If provided, this parameter will override values (default or not) for the
+        following parameters: ``delimiter``, ``doublequote``, ``escapechar``,
+        ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to
+        override values, a ``ParserWarning`` will be issued. See ``csv.Dialect``
+        documentation for more details.
+    on_bad_lines : {'error', 'warn', 'skip'} or Callable, default 'error'
+        Specifies what to do upon encountering a bad line (a line with too many fields).
+        Allowed values are:
+
+        - ``'error'``, raise an Exception when a bad line is encountered.
+        - ``'warn'``, raise a warning when a bad line is encountered and skip that line.
+        - ``'skip'``, skip bad lines without raising or warning when they are encountered.
+        - Callable, function that will process a single bad line.
+        - With ``engine='python'``, function with signature
+          ``(bad_line: list[str]) -> list[str] | None``.
+          ``bad_line`` is a list of strings split by the ``sep``.
+          If the function returns ``None``, the bad line will be ignored.
+          If the function returns a new ``list`` of strings with more elements than
+          expected, a ``ParserWarning`` will be emitted while dropping extra elements.
+        - With ``engine='pyarrow'``, function with signature
+          as described in pyarrow documentation: `invalid_row_handler
+          <https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler>`_.
+
+        .. versionadded:: 1.3.0
+
+        .. versionadded:: 1.4.0
+
+            Callable
+
+        .. versionchanged:: 2.2.0
+
+            Callable for ``engine='pyarrow'``
+
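A minimal sketch of the callable form of ``on_bad_lines`` with the Python engine; the inline data (one row with an extra field) is made up for illustration:

    import io
    import pandas as pd

    data = io.StringIO("a,b\n1,2\n3,4,5\n6,7\n")
    # Truncate any row with too many fields instead of raising an error.
    df = pd.read_csv(data, engine="python", on_bad_lines=lambda bad: bad[:2])
    print(df.to_dict("list"))  # {'a': [1, 3, 6], 'b': [2, 4, 7]}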
+    low_memory : bool, default True
+        Internally process the file in chunks, resulting in lower memory use
+        while parsing, but possibly mixed type inference. To ensure no mixed
+        types either set ``False``, or specify the type with the ``dtype`` parameter.
+        Note that the entire file is read into a single :class:`~pandas.DataFrame`
+        regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in
+        chunks. (Only valid with C parser).
+    memory_map : bool, default False
+        If a filepath is provided for ``filepath_or_buffer``, map the file object
+        directly onto memory and access the data directly from there. Using this
+        option can improve performance because there is no longer any I/O overhead.
+    float_precision : {'high', 'legacy', 'round_trip'}, optional
+        Specifies which converter the C engine should use for floating-point
+        values. The options are ``None`` or ``'high'`` for the ordinary converter,
+        ``'legacy'`` for the original lower precision pandas converter, and
+        ``'round_trip'`` for the round-trip converter.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files>`_.
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
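A short sketch of the ``dtype_backend`` option described above; the inline data is invented, and the printed dtypes assume pandas 2.x behavior:

    import io
    import pandas as pd

    data = io.StringIO("a,b\n1,\n2,5\n")
    # Nullable NumPy-backed dtypes: the missing value becomes pd.NA, not NaN.
    df = pd.read_csv(data, dtype_backend="numpy_nullable")
    print(df.dtypes.tolist())  # [Int64Dtype(), Int64Dtype()]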
+
+    Returns
+    -------
+    DataFrame or TextFileReader
+        A comma-separated values (csv) file is returned as a two-dimensional
+        data structure with labeled axes.
+
+    See Also
+    --------
+    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
+    read_table : Read general delimited file into DataFrame.
+    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+    Examples
+    --------
+    >>> pd.read_csv("data.csv")  # doctest: +SKIP
+       Name  Value
+    0   foo      1
+    1   bar      2
+    2  #baz      3
+
+    Index and header can be specified via the `index_col` and `header` arguments.
+
+    >>> pd.read_csv("data.csv", header=None)  # doctest: +SKIP
+          0      1
+    0  Name  Value
+    1   foo      1
+    2   bar      2
+    3  #baz      3
+
+    >>> pd.read_csv("data.csv", index_col="Value")  # doctest: +SKIP
+           Name
+    Value
+    1       foo
+    2       bar
+    3      #baz
+
+    Column types are inferred but can be explicitly specified using the dtype argument.
+
+    >>> pd.read_csv("data.csv", dtype={"Value": float})  # doctest: +SKIP
+       Name  Value
+    0   foo    1.0
+    1   bar    2.0
+    2  #baz    3.0
+
+    True, False, and NA values, and thousands separators have defaults,
+    but can be explicitly specified, too. Supply the values you would like
+    as strings or lists of strings!
+
+    >>> pd.read_csv("data.csv", na_values=["foo", "bar"])  # doctest: +SKIP
+       Name  Value
+    0   NaN      1
+    1   NaN      2
+    2  #baz      3
+
+    Comment lines in the input file can be skipped using the `comment` argument.
+
+    >>> pd.read_csv("data.csv", comment="#")  # doctest: +SKIP
+      Name  Value
+    0  foo      1
+    1  bar      2
+
+    By default, columns with dates will be read as ``object`` rather than ``datetime``.
+
+    >>> df = pd.read_csv("tmp.csv")  # doctest: +SKIP
+
+    >>> df  # doctest: +SKIP
+       col 1      col 2            col 3
+    0     10 10/04/2018  Sun 15 Jan 2023
+    1     20 15/04/2018  Fri 12 May 2023
+
+    >>> df.dtypes  # doctest: +SKIP
+    col 1     int64
+    col 2    object
+    col 3    object
+    dtype: object
+
+    Specific columns can be parsed as dates by using the `parse_dates` and
+    `date_format` arguments.
+
+    >>> df = pd.read_csv(
+    ...     "tmp.csv",
+    ...     parse_dates=[1, 2],
+    ...     date_format={"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"},
+    ... )  # doctest: +SKIP
+
+    >>> df.dtypes  # doctest: +SKIP
+    col 1             int64
+    col 2    datetime64[ns]
+    col 3    datetime64[ns]
+    dtype: object
+    """
     # locals() should never be modified
     kwds = locals().copy()
     del kwds["filepath_or_buffer"]
@@ -894,23 +1310,6 @@ def read_table(
 ) -> DataFrame | TextFileReader: ...
 
 
-@Appender(
-    _doc_read_csv_and_table.format(
-        func_name="read_table",
-        summary="Read general delimited file into DataFrame.",
-        see_also_func_name="read_csv",
-        see_also_func_summary=(
-            "Read a comma-separated values (csv) file into DataFrame."
-        ),
-        na_values_str=fill(
-            '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    "
-        ),
-        _default_sep=r"'\\t' (tab-stop)",
-        storage_options=_shared_docs["storage_options"],
-        decompression_options=_shared_docs["decompression_options"]
-        % "filepath_or_buffer",
-    )
-)
 @set_module("pandas")
 def read_table(
     filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
@@ -969,6 +1368,440 @@ def read_table(
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
 ) -> DataFrame | TextFileReader:
+    """
+    Read general delimited file into DataFrame.
+
+    Also supports optionally iterating or breaking of the file
+    into chunks.
+
+    Additional help can be found in the online docs for
+    `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
+        expected. A local file could be: file://localhost/path/to/table.csv.
+
+        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
+
+        By file-like object, we refer to objects with a ``read()`` method, such as
+        a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
+    sep : str, default '\\t' (tab-stop)
+        Character or regex pattern to treat as the delimiter. If ``sep=None``, the
+        C engine cannot automatically detect
+        the separator, but the Python parsing engine can, meaning the latter will
+        be used and automatically detect the separator from only the first valid
+        row of the file by Python's builtin sniffer tool, ``csv.Sniffer``.
+        In addition, separators longer than 1 character and different from
+        ``'\\s+'`` will be interpreted as regular expressions and will also force
+        the use of the Python parsing engine. Note that regex delimiters are prone
+        to ignoring quoted data. Regex example: ``'\r\t'``.
+    delimiter : str, optional
+        Alias for ``sep``.
+    header : int, Sequence of int, 'infer' or None, default 'infer'
+        Row number(s) containing column labels and marking the start of the
+        data (zero-indexed). Default behavior is to infer the column names: if no ``names``
+        are passed the behavior is identical to ``header=0`` and column
+        names are inferred from the first line of the file, if column
+        names are passed explicitly to ``names`` then the behavior is identical to
+        ``header=None``. Explicitly pass ``header=0`` to be able to
+        replace existing names. The header can be a list of integers that
+        specify row locations for a :class:`~pandas.MultiIndex` on the columns
+        e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be
+        skipped (e.g. 2 in this example is skipped). Note that this
+        parameter ignores commented lines and empty lines if
+        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
+        data rather than the first line of the file.
+
+        When inferred from the file contents, headers are kept distinct from
+        each other by renaming duplicate names with a numeric suffix of the form
+        ``".{count}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``.
+        Empty headers are named ``"Unnamed: {i}"`` or ``"Unnamed: {i}_level_{level}"``
+        in the case of MultiIndex columns.
+    names : Sequence of Hashable, optional
+        Sequence of column labels to apply. If the file contains a header row,
+        then you should explicitly pass ``header=0`` to override the column names.
+        Duplicates in this list are not allowed.
+    index_col : Hashable, Sequence of Hashable or False, optional
+        Column(s) to use as row label(s), denoted either by column labels or column
+        indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
+        will be formed for the row labels.
+
+        Note: ``index_col=False`` can be used to force pandas to *not* use the first
+        column as the index, e.g., when you have a malformed file with delimiters at
+        the end of each line.
+    usecols : Sequence of Hashable or Callable, optional
+        Subset of columns to select, denoted either by column labels or column indices.
+        If list-like, all elements must either
+        be positional (i.e. integer indices into the document columns) or strings
+        that correspond to column names provided either by the user in ``names`` or
+        inferred from the document header row(s). If ``names`` are given, the document
+        header row(s) are not taken into account. For example, a valid list-like
+        ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
+        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
+        To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order
+        preserved use ``pd.read_table(data, usecols=['foo', 'bar'])[['foo', 'bar']]``
+        for columns in ``['foo', 'bar']`` order or
+        ``pd.read_table(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
+        for ``['bar', 'foo']`` order.
+
+        If callable, the callable function will be evaluated against the column
+        names, returning names where the callable function evaluates to ``True``. An
+        example of a valid callable argument would be ``lambda x: x.upper() in
+        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
+        parsing time and lower memory usage.
+    dtype : dtype or dict of {Hashable : dtype}, optional
+        Data type(s) to apply to either the whole dataset or individual columns.
+        E.g., ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}``
+        Use ``str`` or ``object`` together with suitable ``na_values`` settings
+        to preserve and not interpret ``dtype``.
+        If ``converters`` are specified, they will be applied INSTEAD
+        of ``dtype`` conversion.
+
+        .. versionadded:: 1.5.0
+
+            Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where
+            the default determines the ``dtype`` of the columns which are not explicitly
+            listed.
+    engine : {'c', 'python', 'pyarrow'}, optional
+        Parser engine to use. The C and pyarrow engines are faster, while the python engine
+        is currently more feature-complete. Multithreading is currently only supported by
+        the pyarrow engine.
+
+        .. versionadded:: 1.4.0
+
+            The 'pyarrow' engine was added as an *experimental* engine, and some features
+            are unsupported, or may not work correctly, with this engine.
+    converters : dict of {Hashable : Callable}, optional
+        Functions for converting values in specified columns. Keys can either
+        be column labels or column indices.
+    true_values : list, optional
+        Values to consider as ``True`` in addition to case-insensitive variants of 'True'.
+    false_values : list, optional
+        Values to consider as ``False`` in addition to case-insensitive variants of 'False'.
+    skipinitialspace : bool, default False
+        Skip spaces after delimiter.
+    skiprows : int, list of int or Callable, optional
+        Line numbers to skip (0-indexed) or number of lines to skip (``int``)
+        at the start of the file.
+
+        If callable, the callable function will be evaluated against the row
+        indices, returning ``True`` if the row should be skipped and ``False`` otherwise.
+        An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
+    skipfooter : int, default 0
+        Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
+    nrows : int, optional
+        Number of rows of file to read. Useful for reading pieces of large files.
+        Refers to the number of data rows in the returned DataFrame, excluding:
+
+        * The header row containing column names.
+        * Rows before the header row, if ``header=1`` or larger.
+
+        Example usage:
+
+        * To read the first 999,999 (non-header) rows:
+          ``read_table(..., nrows=999999)``
+
+        * To read rows 1,000,000 through 1,999,999:
+          ``read_table(..., skiprows=1000000, nrows=999999)``
+    na_values : Hashable, Iterable of Hashable or dict of {Hashable : Iterable}, optional
+        Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
+        per-column ``NA`` values. By default the following values are interpreted as
+        ``NaN``: empty string, "NaN", "N/A", "NULL", and other common representations of missing data.
+    keep_default_na : bool, default True
+        Whether or not to include the default ``NaN`` values when parsing the data.
+        Depending on whether ``na_values`` is passed in, the behavior is as follows:
+
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values``
+          is appended to the default ``NaN`` values used for parsing.
+        * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only
+          the default ``NaN`` values are used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only
+          the ``NaN`` values specified ``na_values`` are used for parsing.
+        * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no
+          strings will be parsed as ``NaN``.
+
+        Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and
+        ``na_values`` parameters will be ignored.
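A compact sketch of per-column ``na_values`` combined with ``keep_default_na``; the inline data is invented for illustration:

    import io
    import pandas as pd

    data = io.StringIO("name,score\nn/a,1\nfoo,-1\n")
    # '-1' is treated as NA only in 'score'; the defaults (like 'n/a') still apply.
    df = pd.read_table(data, sep=",", na_values={"score": ["-1"]}, keep_default_na=True)
    print(df.isna().sum().tolist())  # [1, 1]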
+    na_filter : bool, default True
+        Detect missing value markers (empty strings and the value of ``na_values``). In
+        data without any ``NA`` values, passing ``na_filter=False`` can improve the
+        performance of reading a large file.
+    skip_blank_lines : bool, default True
+        If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
+    parse_dates : bool, None, list of Hashable, default None
+        The behavior is as follows:
+
+        * ``bool``. If ``True`` -> try parsing the index.
+        * ``None``. Behaves like ``True`` if ``date_format`` is specified.
+        * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
+          each as a separate date column.
+
+        If a column or index cannot be represented as an array of ``datetime``,
+        say because of an unparsable value or a mixture of timezones, the column
+        or index will be returned unaltered as an ``object`` data type. For
+        non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after
+        :func:`~pandas.read_table`.
+
+        Note: A fast-path exists for iso8601-formatted dates.
+    date_format : str or dict of column -> format, optional
+        Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``.
+        The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
+        `strftime documentation
+        <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`_
+        for more information on choices, though
+        note that :const:`"%f"` will parse all the way up to nanoseconds.
+        You can also pass:
+
+        - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
+          time string (not necessarily in exactly the same format);
+        - "mixed", to infer the format for each element individually. This is risky,
+          and you should probably use it along with `dayfirst`.
+
+        .. versionadded:: 2.0.0
+    dayfirst : bool, default False
+        DD/MM format dates, international and European format.
+    cache_dates : bool, default True
+        If ``True``, use a cache of unique, converted dates to apply the ``datetime``
+        conversion. May produce significant speed-up when parsing duplicate
+        date strings, especially ones with timezone offsets.
+
+    iterator : bool, default False
+        Return ``TextFileReader`` object for iteration or getting chunks with
+        ``get_chunk()``.
+    chunksize : int, optional
+        Number of lines to read from the file per chunk. Passing a value will cause the
+        function to return a ``TextFileReader`` object for iteration.
+        See the `IO Tools docs
+        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-chunking>`_
+        for more information on ``iterator`` and ``chunksize``.
+
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and 'filepath_or_buffer' is
+        path-like, then detect compression from the following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
+        Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+        .. versionadded:: 1.5.0
+            Added support for `.tar` files.
+
+    thousands : str (length 1), optional
+        Character acting as the thousands separator in numerical values.
+    decimal : str (length 1), default '.'
+        Character to recognize as decimal point (e.g., use ',' for European data).
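A tiny sketch of ``thousands`` and ``decimal`` for European-formatted numbers; the inline data is invented:

    import io
    import pandas as pd

    data = io.StringIO("city;price\nBerlin;1.234,56\n")
    # '.' groups thousands and ',' marks the decimal point.
    df = pd.read_table(data, sep=";", thousands=".", decimal=",")
    print(df["price"].iloc[0])  # 1234.56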
+    lineterminator : str (length 1), optional
+        Character used to denote a line break. Only valid with C parser.
+    quotechar : str (length 1), optional
+        Character used to denote the start and end of a quoted item. Quoted
+        items can include the ``delimiter`` and it will be ignored.
+    quoting : {0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}, default csv.QUOTE_MINIMAL
+        Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is
+        ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special
+        characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``,
+        or ``lineterminator``).
+    doublequote : bool, default True
+        When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate
+        whether or not to interpret two consecutive ``quotechar`` elements INSIDE a
+        field as a single ``quotechar`` element.
+    escapechar : str (length 1), optional
+        Character used to escape other characters.
+    comment : str (length 1), optional
+        Character indicating that the remainder of line should not be parsed.
+        If found at the beginning
+        of a line, the line will be ignored altogether. This parameter must be a
+        single character. Like empty lines (as long as ``skip_blank_lines=True``),
+        fully commented lines are ignored by the parameter ``header`` but not by
+        ``skiprows``. For example, if ``comment='#'``, parsing
+        ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being
+        treated as the header.
+    encoding : str, optional, default 'utf-8'
+        Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python
+        standard encodings
+        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
+
+    encoding_errors : str, optional, default 'strict'
+        How encoding errors are treated. `List of possible values
+        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
+
+        .. versionadded:: 1.3.0
+
+    dialect : str or csv.Dialect, optional
+        If provided, this parameter will override values (default or not) for the
+        following parameters: ``delimiter``, ``doublequote``, ``escapechar``,
+        ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to
+        override values, a ``ParserWarning`` will be issued. See ``csv.Dialect``
+        documentation for more details.
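A minimal sketch of passing a ``csv.Dialect`` subclass; the dialect and data are invented, and pandas is expected to emit a ``ParserWarning`` here because the dialect overrides ``read_table``'s tab delimiter:

    import csv
    import io
    import pandas as pd

    class PipeDialect(csv.Dialect):
        delimiter = "|"
        quotechar = '"'
        doublequote = True
        skipinitialspace = True
        lineterminator = "\n"
        quoting = csv.QUOTE_MINIMAL

    data = io.StringIO('a| b\n1| "x|y"\n')
    # The dialect supplies delimiter/quoting settings as one object.
    df = pd.read_table(data, dialect=PipeDialect())
    print(df.iloc[0, 1])  # x|y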
To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. + Note that the entire file is read into a single :class:`~pandas.DataFrame` + regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + chunks. (Only valid with C parser). + memory_map : bool, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. + float_precision : {{'high', 'legacy', 'round_trip'}}, optional + Specifies which converter the C engine should use for floating-point + values. The options are ``None`` or ``'high'`` for the ordinary converter, + ``'legacy'`` for the original lower precision pandas converter, and + ``'round_trip'`` for the round-trip converter. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. + + dtype_backend : {{'numpy_nullable', 'pyarrow'}} + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + + .. versionadded:: 2.0 + + Returns + ------- + DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Examples + -------- + >>> pd.read_table("data.csv") # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + 2 #baz 3 + + Index and header can be specified via the `index_col` and `header` arguments. + + >>> pd.read_table("data.csv", header=None) # doctest: +SKIP + 0 1 + 0 Name Value + 1 foo 1 + 2 bar 2 + 3 #baz 3 + + >>> pd.read_table("data.csv", index_col="Value") # doctest: +SKIP + Name + Value + 1 foo + 2 bar + 3 #baz + + Column types are inferred but can be explicitly specified using the dtype argument. + + >>> pd.read_table("data.csv", dtype={{"Value": float}}) # doctest: +SKIP + Name Value + 0 foo 1.0 + 1 bar 2.0 + 2 #baz 3.0 + + True, False, and NA values, and thousands separators have defaults, + but can be explicitly specified, too. Supply the values you would like + as strings or lists of strings! + + >>> pd.read_table("data.csv", na_values=["foo", "bar"]) # doctest: +SKIP + Name Value + 0 NaN 1 + 1 NaN 2 + 2 #baz 3 + + Comment lines in the input file can be skipped using the `comment` argument. + + >>> pd.read_table("data.csv", comment="#") # doctest: +SKIP + Name Value + 0 foo 1 + 1 bar 2 + + By default, columns with dates will be read as ``object`` rather than ``datetime``. 
+ + >>> df = pd.read_table("tmp.csv") # doctest: +SKIP + + >>> df # doctest: +SKIP + col 1 col 2 col 3 + 0 10 10/04/2018 Sun 15 Jan 2023 + 1 20 15/04/2018 Fri 12 May 2023 + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 object + col 3 object + dtype: object + + Specific columns can be parsed as dates by using the `parse_dates` and + `date_format` arguments. + + >>> df = pd.read_table( + ... "tmp.csv", + ... parse_dates=[1, 2], + ... date_format={{"col 2": "%d/%m/%Y", "col 3": "%a %d %b %Y"}}, + ... ) # doctest: +SKIP + + >>> df.dtypes # doctest: +SKIP + col 1 int64 + col 2 datetime64[ns] + col 3 datetime64[ns] + dtype: object + """ # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1939,4 +2772,4 @@ def _validate_skipfooter(kwds: dict[str, Any]) -> None: if kwds.get("iterator") or kwds.get("chunksize"): raise ValueError("'skipfooter' not supported for iteration") if kwds.get("nrows"): - raise ValueError("'skipfooter' not supported with 'nrows'") \ No newline at end of file + raise ValueError("'skipfooter' not supported with 'nrows'") From 41e4b6bfb387b8a77f76f85529ab9ede0f0d0324 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Thu, 16 Oct 2025 23:14:06 +0800 Subject: [PATCH 12/24] refactor: replace @Appender with hardcoded docstrings for read_csv and read_table --- pandas/io/parsers/readers.py | 198 +++++++++++++++++++++++------------ 1 file changed, 132 insertions(+), 66 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 5dda6dddf15ec..f011f8e2754b8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -850,7 +850,8 @@ def read_csv( Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + data (zero-indexed). Default behavior is to infer the column names: + if no ``names`` are passed the behavior is identical to ``header=0`` and column names are inferred from the first line of the file, if column names are passed explicitly to ``names`` then the behavior is identical to @@ -866,7 +867,8 @@ def read_csv( When inferred from the file contents, headers are kept distinct from each other by renaming duplicate names with a numeric suffix of the form ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` + Empty headers are named ``"Unnamed: {{i}}"`` or `` + "Unnamed: {{i}}_level_{{level}}"`` in the case of MultiIndex columns. names : Sequence of Hashable, optional Sequence of column labels to apply. If the file contains a header row, @@ -874,18 +876,21 @@ def read_csv( Duplicates in this list are not allowed. index_col : Hashable, Sequence of Hashable or False, optional Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` will be formed for the row labels. Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g., when you have a malformed file with delimiters at the end of each line. usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. 
+ Subset of columns to select, denoted either + by column labels or column indices. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document + inferred from the document header row(s). + If ``names`` are given, the document header row(s) are not taken into account. For example, a valid list-like ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. @@ -910,25 +915,32 @@ def read_csv( .. versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where - the default determines the ``dtype`` of the columns which are not explicitly + Support for ``defaultdict`` was + added. Specify a ``defaultdict`` as input where + the default determines the ``dtype`` + of the columns which are not explicitly listed. engine : {{'c', 'python', 'pyarrow'}}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by + Parser engine to use. The C and pyarrow engines are faster, + while the python engine + is currently more feature-complete. Multithreading + is currently only supported by the pyarrow engine. .. versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and some features + The 'pyarrow' engine was added as an *experimental* engine, + and some features are unsupported, or may not work correctly, with this engine. converters : dict of {{Hashable : Callable}}, optional Functions for converting values in specified columns. Keys can either be column labels or column indices. true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. + Values to consider as ``True`` in addition + to case-insensitive variants of 'True'. false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. + Values to consider as ``False`` in addition to case-insensitive + variants of 'False'. skipinitialspace : bool, default False Skip spaces after delimiter. skiprows : int, list of int or Callable, optional @@ -936,7 +948,8 @@ def read_csv( at the start of the file. If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. + indices, returning ``True`` if the row should be skipped and ``False`` + otherwise. An example of a valid callable argument would be ``lambda x: x in [0, 2]``. skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). @@ -954,15 +967,20 @@ def read_csv( * To read rows 1,000,000 through 1,999,999: ``read_csv(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: empty string, "NaN", "N/A", "NULL", and other common representations of missing data. + na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, + optional + Additional strings to recognize as ``NA``/``NaN``. 
If ``dict`` + passed, specific + per-column ``NA`` values. By default the following values + are interpreted as + ``NaN``: empty string, "NaN", "N/A", "NULL", and other common + representations of missing data. keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + * If ``keep_default_na`` is ``True``, and ``na_values`` + are specified, ``na_values`` is appended to the default ``NaN`` values used for parsing. * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default ``NaN`` values are used for parsing. @@ -971,7 +989,8 @@ def read_csv( * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no strings will be parsed as ``NaN``. - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + Note that if ``na_filter`` is passed in as ``False``, + the ``keep_default_na`` and ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of ``na_values``). In @@ -984,7 +1003,8 @@ def read_csv( * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + * ``list`` of ``int`` or + names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. If a column or index cannot be represented as an array of ``datetime``, @@ -995,7 +1015,8 @@ def read_csv( Note: A fast-path exists for iso8601-formatted dates. date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. + Format to use for parsing dates and/or times when + used in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See `strftime documentation list[str] | None``. ``bad_line`` is a list of strings split by the ``sep``. If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. + If the function returns a new ``list`` of strings with + more elements than + expected, a ``ParserWarning`` will be emitted while + dropping extra elements. - With ``engine='pyarrow'``, function with signature as described in pyarrow documentation: `invalid_row_handler - `_. .. versionadded:: 1.3.0 @@ -1126,7 +1159,8 @@ def read_csv( while parsing, but possibly mixed type inference. To ensure no mixed types either set ``False``, or specify the type with the ``dtype`` parameter. Note that the entire file is read into a single :class:`~pandas.DataFrame` - regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + regardless, use the ``chunksize`` or ``iterator`` + parameter to return the data in chunks. (Only valid with C parser). 
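To make the ``chunksize``/``iterator`` behaviour referenced above concrete, a minimal sketch follows; ``big.csv`` and ``process`` are hypothetical placeholders, not names defined by this patch. The returned ``TextFileReader`` is also a context manager, so the underlying handle is closed when the block exits.

>>> with pd.read_csv("big.csv", chunksize=100_000) as reader:  # doctest: +SKIP
...     for chunk in reader:
...         process(chunk)  # each chunk is a DataFrame of at most 100,000 rows

>>> reader = pd.read_csv("big.csv", iterator=True)  # doctest: +SKIP
>>> first_rows = reader.get_chunk(500)  # doctest: +SKIP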
memory_map : bool, default False If a filepath is provided for ``filepath_or_buffer``, map the file object @@ -1155,7 +1189,8 @@ def read_csv( is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + * ``"pyarrow"``: returns + pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1402,7 +1437,8 @@ def read_table( Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + data (zero-indexed). Default behavior + is to infer the column names: if no ``names`` are passed the behavior is identical to ``header=0`` and column names are inferred from the first line of the file, if column names are passed explicitly to ``names`` then the behavior is identical to @@ -1418,7 +1454,8 @@ def read_table( When inferred from the file contents, headers are kept distinct from each other by renaming duplicate names with a numeric suffix of the form ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` + Empty headers are named + ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` in the case of MultiIndex columns. names : Sequence of Hashable, optional Sequence of column labels to apply. If the file contains a header row, @@ -1462,25 +1499,32 @@ def read_table( .. versionadded:: 1.5.0 - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where - the default determines the ``dtype`` of the columns which are not explicitly + Support for ``defaultdict`` was added. + Specify a ``defaultdict`` as input where + the default determines the ``dtype`` of the columns which + are not explicitly listed. engine : {{'c', 'python', 'pyarrow'}}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by + Parser engine to use. The C and pyarrow engines are faster, + while the python engine + is currently more feature-complete. Multithreading is + currently only supported by the pyarrow engine. .. versionadded:: 1.4.0 - The 'pyarrow' engine was added as an *experimental* engine, and some features + The 'pyarrow' engine was added as an *experimental* engine, + and some features are unsupported, or may not work correctly, with this engine. converters : dict of {{Hashable : Callable}}, optional Functions for converting values in specified columns. Keys can either be column labels or column indices. true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. + Values to consider as ``True`` in addition to + case-insensitive variants of 'True'. false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. + Values to consider as ``False`` in addition + to case-insensitive variants of 'False'. skipinitialspace : bool, default False Skip spaces after delimiter. skiprows : int, list of int or Callable, optional @@ -1488,7 +1532,8 @@ def read_table( at the start of the file. If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. 
+ indices, returning ``True`` if the row + should be skipped and ``False`` otherwise. An example of a valid callable argument would be ``lambda x: x in [0, 2]``. skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). @@ -1506,15 +1551,19 @@ def read_table( * To read rows 1,000,000 through 1,999,999: ``read_csv(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific + na_values : Hashable, Iterable of Hashable or + dict of {{Hashable : Iterable}}, optional + Additional strings to recognize as ``NA``/``NaN``. + If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: empty string, "NaN", "N/A", "NULL", and other common representations of missing data. + ``NaN``: empty string, "NaN", "N/A", "NULL", and other + common representations of missing data. keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + * If ``keep_default_na`` is ``True``, + and ``na_values`` are specified, ``na_values`` is appended to the default ``NaN`` values used for parsing. * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default ``NaN`` values are used for parsing. @@ -1523,7 +1572,8 @@ def read_table( * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no strings will be parsed as ``NaN``. - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + Note that if ``na_filter`` is passed in as + ``False``, the ``keep_default_na`` and ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of ``na_values``). In @@ -1536,7 +1586,8 @@ def read_table( * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> + try parsing columns 1, 2, 3 each as a separate date column. If a column or index cannot be represented as an array of ``datetime``, @@ -1547,7 +1598,8 @@ def read_table( Note: A fast-path exists for iso8601-formatted dates. date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. + Format to use for parsing dates and/or times when used + in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See `strftime documentation list[str] | None``. ``bad_line`` is a list of strings split by the ``sep``. If the function returns ``None``, the bad line will be ignored. If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. + expected, a ``ParserWarning`` will be emitted while + dropping extra elements. - With ``engine='pyarrow'``, function with signature as described in pyarrow documentation: `invalid_row_handler - `_. .. versionadded:: 1.3.0 @@ -1678,7 +1742,8 @@ def read_table( while parsing, but possibly mixed type inference. 
To ensure no mixed types either set ``False``, or specify the type with the ``dtype`` parameter. Note that the entire file is read into a single :class:`~pandas.DataFrame` - regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + regardless, use the ``chunksize`` or ``iterator`` parameter + to return the data in chunks. (Only valid with C parser). memory_map : bool, default False If a filepath is provided for ``filepath_or_buffer``, map the file object @@ -1707,7 +1772,8 @@ def read_table( is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 From 85c1a4fdc23063a433341df077a03d9db87c072c Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 17 Oct 2025 00:21:39 +0800 Subject: [PATCH 13/24] refactor: replace @Appender with hardcoded docstrings for read_csv and read_table --- pandas/io/parsers/readers.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index f011f8e2754b8..0429b4533f1ae 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -980,7 +980,7 @@ def read_csv( Depending on whether ``na_values`` is passed in, the behavior is as follows: * If ``keep_default_na`` is ``True``, and ``na_values`` - are specified, ``na_values`` + are specified, ``na_values`` is appended to the default ``NaN`` values used for parsing. * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default ``NaN`` values are used for parsing. @@ -1003,8 +1003,8 @@ def read_csv( * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or - names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + * ``list`` of ``int`` or names. + e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. If a column or index cannot be represented as an array of ``datetime``, @@ -1089,9 +1089,9 @@ def read_csv( in ``quotechar``, ``delimiter``, or ``lineterminator``. doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive ``quotechar`` elements INSIDE a - field as a single ``quotechar`` element. + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive ``quotechar`` elements INSIDE a + field as a single ``quotechar`` element. escapechar : str (length 1), optional Character used to escape other characters. comment : str (length 1), optional @@ -1126,22 +1126,22 @@ def read_csv( - ``'error'``, raise an Exception when a bad line is encountered. - ``'warn'``, raise a warning when a bad line is - encountered and skip that line. + encountered and skip that line. - ``'skip'``, skip bad lines without raising or warning when - they are encountered. + they are encountered. - Callable, function that will process a single bad line. - With ``engine='python'``, function with signature ``(bad_line: list[str]) -> list[str] | None``. ``bad_line`` is a list of strings split by the ``sep``. If the function returns ``None``, the bad line will be ignored. 
If the function returns a new ``list`` of strings with - more elements than + more elements than expected, a ``ParserWarning`` will be emitted while - dropping extra elements. + dropping extra elements. - With ``engine='pyarrow'``, function with signature as described in pyarrow documentation: `invalid_row_handler `_. .. versionadded:: 1.3.0 From 79afdfe33f3f32788847c13908f4e3102d6fd944 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 17 Oct 2025 01:11:33 +0800 Subject: [PATCH 14/24] Update readers.py --- pandas/io/parsers/readers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 0429b4533f1ae..631e972757021 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -967,6 +967,7 @@ def read_csv( * To read rows 1,000,000 through 1,999,999: ``read_csv(..., skiprows=1000000, nrows=999999)`` + na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` @@ -1551,6 +1552,7 @@ def read_table( * To read rows 1,000,000 through 1,999,999: ``read_csv(..., skiprows=1000000, nrows=999999)`` + na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional Additional strings to recognize as ``NA``/``NaN``. @@ -1587,7 +1589,7 @@ def read_table( * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> - try parsing columns 1, 2, 3 + try parsing columns 1, 2, 3 each as a separate date column. If a column or index cannot be represented as an array of ``datetime``, @@ -1710,9 +1712,9 @@ def read_table( - ``'error'``, raise an Exception when a bad line is encountered. - ``'warn'``, raise a warning when a bad line is encountered and - skip that line. + skip that line. - ``'skip'``, skip bad lines without raising or warning when they - are encountered. + are encountered. - Callable, function that will process a single bad line. - With ``engine='python'``, function with signature ``(bad_line: list[str]) -> list[str] | None``. @@ -1720,7 +1722,7 @@ def read_table( If the function returns ``None``, the bad line will be ignored. If the function returns a new ``list`` of strings with more elements than expected, a ``ParserWarning`` will be emitted while - dropping extra elements. + dropping extra elements. - With ``engine='pyarrow'``, function with signature as described in pyarrow documentation: `invalid_row_handler Date: Fri, 17 Oct 2025 02:05:02 +0800 Subject: [PATCH 15/24] Update readers.py --- pandas/io/parsers/readers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 631e972757021..9fdb1e22c9cd2 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -969,7 +969,7 @@ def read_csv( ``read_csv(..., skiprows=1000000, nrows=999999)`` na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, - optional + optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values @@ -1082,7 +1082,7 @@ def read_csv( Character used to denote the start and end of a quoted item. Quoted items can include the ``delimiter`` and it will be ignored. 
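A short sketch of the ``quotechar``/``doublequote`` interaction just described, using invented in-memory data: with the default ``doublequote=True``, two consecutive quote characters inside a quoted field collapse into a single one.

>>> from io import StringIO
>>> data = 'name,comment\nalice,"said ""hi"", then left"\n'
>>> pd.read_csv(StringIO(data))  # doctest: +SKIP
    name               comment
0  alice  said "hi", then left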
quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, - 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL + 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special @@ -1553,8 +1553,8 @@ def read_table( * To read rows 1,000,000 through 1,999,999: ``read_csv(..., skiprows=1000000, nrows=999999)`` - na_values : Hashable, Iterable of Hashable or - dict of {{Hashable : Iterable}}, optional + na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, + optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as @@ -1565,7 +1565,7 @@ def read_table( Depending on whether ``na_values`` is passed in, the behavior is as follows: * If ``keep_default_na`` is ``True``, - and ``na_values`` are specified, ``na_values`` + and ``na_values`` are specified, ``na_values`` is appended to the default ``NaN`` values used for parsing. * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default ``NaN`` values are used for parsing. @@ -1588,8 +1588,8 @@ def read_table( * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> - try parsing columns 1, 2, 3 + * ``list`` of ``int`` or names. + e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. If a column or index cannot be represented as an array of ``datetime``, @@ -1666,7 +1666,7 @@ def read_table( Character used to denote the start and end of a quoted item. Quoted items can include the ``delimiter`` and it will be ignored. quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or - csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL + csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special From 8601d0f3b8e35817beafdd7d5fc7700224dd1cb3 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 17 Oct 2025 02:44:28 +0800 Subject: [PATCH 16/24] Update readers.py --- pandas/io/parsers/readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 9fdb1e22c9cd2..3239e99ace896 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -845,7 +845,7 @@ def read_csv( In addition, separators longer than 1 character and different from ``'\\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. + to ignoring quoted data. Regex example: ``'\r '``. delimiter : str, optional Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' @@ -1433,7 +1433,7 @@ def read_table( In addition, separators longer than 1 character and different from ``'\\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. + to ignoring quoted data. Regex example: ``'\r '``. 
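To make the regex-separator rule above concrete: a separator longer than one character (other than ``'\s+'``) is treated as a regular expression and is only supported by the Python engine, so passing ``engine='python'`` explicitly avoids the fallback ``ParserWarning``. The inline data is invented for illustration.

>>> from io import StringIO
>>> data = "a: 1\nb:  2\n"
>>> pd.read_csv(StringIO(data), sep=r":\s+", engine="python", header=None)  # doctest: +SKIP
   0  1
0  a  1
1  b  2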
delimiter : str, optional Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' From 2aaefd4d2d1baedf0b81fd65814bd1205bb9e0b0 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 17 Oct 2025 02:48:06 +0800 Subject: [PATCH 17/24] Update readers.py --- pandas/io/parsers/readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 3239e99ace896..a8774200f08d7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1191,7 +1191,7 @@ def read_csv( * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` * ``"pyarrow"``: returns - pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` + pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1775,7 +1775,7 @@ def read_table( * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` * ``"pyarrow"``: returns pyarrow-backed nullable - :class:`ArrowDtype` :class:`DataFrame` + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 From bb2c61a40bf3e8098d6552fe09830f91caf438f5 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 17 Oct 2025 03:06:16 +0800 Subject: [PATCH 18/24] Update readers.py --- pandas/io/parsers/readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a8774200f08d7..6b75d426d994d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -845,7 +845,7 @@ def read_csv( In addition, separators longer than 1 character and different from ``'\\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r '``. + to ignoring quoted data. Regex example: ``'\\r\\t'``. delimiter : str, optional Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' @@ -1433,7 +1433,7 @@ def read_table( In addition, separators longer than 1 character and different from ``'\\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r '``. + to ignoring quoted data. Regex example: ``'\\r\\t'``. delimiter : str, optional Alias for ``sep``. header : int, Sequence of int, 'infer' or None, default 'infer' From 8828ad5d69f87d2cf00c41933500ed915e82b5c5 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 24 Oct 2025 10:49:37 +0800 Subject: [PATCH 19/24] Update readers.py --- pandas/io/parsers/readers.py | 451 ++--------------------------------- 1 file changed, 14 insertions(+), 437 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6b75d426d994d..7d710aba42150 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -144,418 +144,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): _read_shared = dict -_doc_read_csv_and_table = r""" -{summary} - -Also supports optionally iterating or breaking of the file -into chunks. - -Additional help can be found in the online docs for -`IO Tools `_. - -Parameters ----------- -filepath_or_buffer : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is - expected. A local file could be: file://localhost/path/to/table.csv. 
- - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, such as - a file handle (e.g. via builtin ``open`` function) or ``StringIO``. -sep : str, default {_default_sep} - Character or regex pattern to treat as the delimiter. If ``sep=None``, the - C engine cannot automatically detect - the separator, but the Python parsing engine can, meaning the latter will - be used and automatically detect the separator from only the first valid - row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. - In addition, separators longer than 1 character and different from - ``'\s+'`` will be interpreted as regular expressions and will also force - the use of the Python parsing engine. Note that regex delimiters are prone - to ignoring quoted data. Regex example: ``'\r\t'``. -delimiter : str, optional - Alias for ``sep``. -header : int, Sequence of int, 'infer' or None, default 'infer' - Row number(s) containing column labels and marking the start of the - data (zero-indexed). Default behavior is to infer the column names: if no ``names`` - are passed the behavior is identical to ``header=0`` and column - names are inferred from the first line of the file, if column - names are passed explicitly to ``names`` then the behavior is identical to - ``header=None``. Explicitly pass ``header=0`` to be able to - replace existing names. The header can be a list of integers that - specify row locations for a :class:`~pandas.MultiIndex` on the columns - e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be - skipped (e.g. 2 in this example is skipped). Note that this - parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so ``header=0`` denotes the first line of - data rather than the first line of the file. - - When inferred from the file contents, headers are kept distinct from - each other by renaming duplicate names with a numeric suffix of the form - ``".{{count}}"`` starting from 1, e.g. ``"foo"`` and ``"foo.1"``. - Empty headers are named ``"Unnamed: {{i}}"`` or ``"Unnamed: {{i}}_level_{{level}}"`` - in the case of MultiIndex columns. -names : Sequence of Hashable, optional - Sequence of column labels to apply. If the file contains a header row, - then you should explicitly pass ``header=0`` to override the column names. - Duplicates in this list are not allowed. -index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. -usecols : Sequence of Hashable or Callable, optional - Subset of columns to select, denoted either by column labels or column indices. - If list-like, all elements must either - be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). If ``names`` are given, the document - header row(s) are not taken into account. For example, a valid list-like - ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 
- To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order - preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` - for columns in ``['foo', 'bar']`` order or - ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` - for ``['bar', 'foo']`` order. - - If callable, the callable function will be evaluated against the column - names, returning names where the callable function evaluates to ``True``. An - example of a valid callable argument would be ``lambda x: x.upper() in - ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster - parsing time and lower memory usage. -dtype : dtype or dict of {{Hashable : dtype}}, optional - Data type(s) to apply to either the whole dataset or individual columns. - E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}`` - Use ``str`` or ``object`` together with suitable ``na_values`` settings - to preserve and not interpret ``dtype``. - If ``converters`` are specified, they will be applied INSTEAD - of ``dtype`` conversion. - - .. versionadded:: 1.5.0 - - Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where - the default determines the ``dtype`` of the columns which are not explicitly - listed. -engine : {{'c', 'python', 'pyarrow'}}, optional - Parser engine to use. The C and pyarrow engines are faster, while the python engine - is currently more feature-complete. Multithreading is currently only supported by - the pyarrow engine. - - .. versionadded:: 1.4.0 - - The 'pyarrow' engine was added as an *experimental* engine, and some features - are unsupported, or may not work correctly, with this engine. -converters : dict of {{Hashable : Callable}}, optional - Functions for converting values in specified columns. Keys can either - be column labels or column indices. -true_values : list, optional - Values to consider as ``True`` in addition to case-insensitive variants of 'True'. -false_values : list, optional - Values to consider as ``False`` in addition to case-insensitive variants of 'False'. -skipinitialspace : bool, default False - Skip spaces after delimiter. -skiprows : int, list of int or Callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (``int``) - at the start of the file. - - If callable, the callable function will be evaluated against the row - indices, returning ``True`` if the row should be skipped and ``False`` otherwise. - An example of a valid callable argument would be ``lambda x: x in [0, 2]``. -skipfooter : int, default 0 - Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). -nrows : int, optional - Number of rows of file to read. Useful for reading pieces of large files. - Refers to the number of data rows in the returned DataFrame, excluding: - - * The header row containing column names. - * Rows before the header row, if ``header=1`` or larger. - - Example usage: - - * To read the first 999,999 (non-header) rows: - ``read_csv(..., nrows=999999)`` - - * To read rows 1,000,000 through 1,999,999: - ``read_csv(..., skiprows=1000000, nrows=999999)`` -na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional - Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific - per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: "{na_values_str}". -keep_default_na : bool, default True - Whether or not to include the default ``NaN`` values when parsing the data. 
- Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` - is appended to the default ``NaN`` values used for parsing. - * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only - the default ``NaN`` values are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only - the ``NaN`` values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no - strings will be parsed as ``NaN``. - - Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and - ``na_values`` parameters will be ignored. -na_filter : bool, default True - Detect missing value markers (empty strings and the value of ``na_values``). In - data without any ``NA`` values, passing ``na_filter=False`` can improve the - performance of reading a large file. -skip_blank_lines : bool, default True - If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, None, list of Hashable, default None - The behavior is as follows: - - * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_format`` is specified. - * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 - each as a separate date column. - - If a column or index cannot be represented as an array of ``datetime``, - say because of an unparsable value or a mixture of timezones, the column - or index will be returned unaltered as an ``object`` data type. For - non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after - :func:`~pandas.read_csv`. - - Note: A fast-path exists for iso8601-formatted dates. -date_format : str or dict of column -> format, optional - Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. - The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See - `strftime documentation - `_ for more information on choices, though - note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass: - - - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. - - .. versionadded:: 2.0.0 -dayfirst : bool, default False - DD/MM format dates, international and European format. -cache_dates : bool, default True - If ``True``, use a cache of unique, converted dates to apply the ``datetime`` - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - -iterator : bool, default False - Return ``TextFileReader`` object for iteration or getting chunks with - ``get_chunk()``. -chunksize : int, optional - Number of lines to read from the file per chunk. Passing a value will cause the - function to return a ``TextFileReader`` object for iteration. - See the `IO Tools docs - `_ - for more information on ``iterator`` and ``chunksize``. - -{decompression_options} - - .. versionchanged:: 1.4.0 Zstandard support. - -thousands : str (length 1), optional - Character acting as the thousands separator in numerical values. -decimal : str (length 1), default '.' - Character to recognize as decimal point (e.g., use ',' for European data). -lineterminator : str (length 1), optional - Character used to denote a line break. 
Only valid with C parser. -quotechar : str (length 1), optional - Character used to denote the start and end of a quoted item. Quoted - items can include the ``delimiter`` and it will be ignored. -quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL - Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is - ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special - characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, - or ``lineterminator``. -doublequote : bool, default True - When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate - whether or not to interpret two consecutive ``quotechar`` elements INSIDE a - field as a single ``quotechar`` element. -escapechar : str (length 1), optional - Character used to escape other characters. -comment : str (length 1), optional - Character indicating that the remainder of line should not be parsed. - If found at the beginning - of a line, the line will be ignored altogether. This parameter must be a - single character. Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter ``header`` but not by - ``skiprows``. For example, if ``comment='#'``, parsing - ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being - treated as the header. -encoding : str, optional, default 'utf-8' - Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python - standard encodings - `_ . - -encoding_errors : str, optional, default 'strict' - How encoding errors are treated. `List of possible values - `_ . - - .. versionadded:: 1.3.0 - -dialect : str or csv.Dialect, optional - If provided, this parameter will override values (default or not) for the - following parameters: ``delimiter``, ``doublequote``, ``escapechar``, - ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to - override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` - documentation for more details. -on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' - Specifies what to do upon encountering a bad line (a line with too many fields). - Allowed values are: - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'warn'``, raise a warning when a bad line is encountered and skip that line. - - ``'skip'``, skip bad lines without raising or warning when they are encountered. - - Callable, function that will process a single bad line. - - With ``engine='python'``, function with signature - ``(bad_line: list[str]) -> list[str] | None``. - ``bad_line`` is a list of strings split by the ``sep``. - If the function returns ``None``, the bad line will be ignored. - If the function returns a new ``list`` of strings with more elements than - expected, a ``ParserWarning`` will be emitted while dropping extra elements. - - With ``engine='pyarrow'``, function with signature - as described in pyarrow documentation: `invalid_row_handler - `_. - - .. versionadded:: 1.3.0 - - .. versionadded:: 1.4.0 - - Callable - - .. versionchanged:: 2.2.0 - - Callable for ``engine='pyarrow'`` - -low_memory : bool, default True - Internally process the file in chunks, resulting in lower memory use - while parsing, but possibly mixed type inference. To ensure no mixed - types either set ``False``, or specify the type with the ``dtype`` parameter. 
- Note that the entire file is read into a single :class:`~pandas.DataFrame` - regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in - chunks. (Only valid with C parser). -memory_map : bool, default False - If a filepath is provided for ``filepath_or_buffer``, map the file object - directly onto memory and access the data directly from there. Using this - option can improve performance because there is no longer any I/O overhead. -float_precision : {{'high', 'legacy', 'round_trip'}}, optional - Specifies which converter the C engine should use for floating-point - values. The options are ``None`` or ``'high'`` for the ordinary converter, - ``'legacy'`` for the original lower precision pandas converter, and - ``'round_trip'`` for the round-trip converter. - -{storage_options} - -dtype_backend : {{'numpy_nullable', 'pyarrow'}} - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior - is to not use nullable data types. If specified, the behavior - is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` - - .. versionadded:: 2.0 - -Returns -------- -DataFrame or TextFileReader - A comma-separated values (csv) file is returned as two-dimensional - data structure with labeled axes. - -See Also --------- -DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. -{see_also_func_name} : {see_also_func_summary} -read_fwf : Read a table of fixed-width formatted lines into DataFrame. - -Examples --------- ->>> pd.{func_name}('data.csv') # doctest: +SKIP - Name Value -0 foo 1 -1 bar 2 -2 #baz 3 - -Index and header can be specified via the `index_col` and `header` arguments. - ->>> pd.{func_name}('data.csv', header=None) # doctest: +SKIP - 0 1 -0 Name Value -1 foo 1 -2 bar 2 -3 #baz 3 - ->>> pd.{func_name}('data.csv', index_col='Value') # doctest: +SKIP - Name -Value -1 foo -2 bar -3 #baz - -Column types are inferred but can be explicitly specified using the dtype argument. - ->>> pd.{func_name}('data.csv', dtype={{'Value': float}}) # doctest: +SKIP - Name Value -0 foo 1.0 -1 bar 2.0 -2 #baz 3.0 - -True, False, and NA values, and thousands separators have defaults, -but can be explicitly specified, too. Supply the values you would like -as strings or lists of strings! - ->>> pd.{func_name}('data.csv', na_values=['foo', 'bar']) # doctest: +SKIP - Name Value -0 NaN 1 -1 NaN 2 -2 #baz 3 - -Comment lines in the input file can be skipped using the `comment` argument. - ->>> pd.{func_name}('data.csv', comment='#') # doctest: +SKIP - Name Value -0 foo 1 -1 bar 2 - -By default, columns with dates will be read as ``object`` rather than ``datetime``. - ->>> df = pd.{func_name}('tmp.csv') # doctest: +SKIP - ->>> df # doctest: +SKIP - col 1 col 2 col 3 -0 10 10/04/2018 Sun 15 Jan 2023 -1 20 15/04/2018 Fri 12 May 2023 - ->>> df.dtypes # doctest: +SKIP -col 1 int64 -col 2 object -col 3 object -dtype: object - -Specific columns can be parsed as dates by using the `parse_dates` and -`date_format` arguments. - ->>> df = pd.{func_name}( -... 'tmp.csv', -... parse_dates=[1, 2], -... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, -... 
) # doctest: +SKIP - ->>> df.dtypes # doctest: +SKIP -col 1 int64 -col 2 datetime64[ns] -col 3 datetime64[ns] -dtype: object -""" # noqa: E501 - - class _C_Parser_Defaults(TypedDict): na_filter: Literal[True] low_memory: Literal[True] @@ -747,7 +335,6 @@ def read_csv( **kwds: Unpack[_read_shared[HashableT]], ) -> DataFrame: ... - @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -875,14 +462,14 @@ def read_csv( then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. @@ -1145,12 +732,6 @@ def read_csv( /generated/pyarrow.csv.ParseOptions.html #pyarrow.csv.ParseOptions.invalid_row_handler>`_. - .. versionadded:: 1.3.0 - - .. versionadded:: 1.4.0 - - Callable - .. versionchanged:: 2.2.0 Callable for ``engine='pyarrow'`` @@ -1463,13 +1044,13 @@ def read_table( then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` - will be formed for the row labels. + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + will be formed for the row labels. - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. If list-like, all elements must either @@ -1735,10 +1316,6 @@ def read_table( Callable - .. versionchanged:: 2.2.0 - - Callable for ``engine='pyarrow'`` - low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. 
To ensure no mixed From 4f0ad97a6cb3da28c95cf46d3dfd537e7101eefa Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 24 Oct 2025 11:01:10 +0800 Subject: [PATCH 20/24] Update readers.py --- pandas/io/parsers/readers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7d710aba42150..c2eed7ef2ee20 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -335,6 +335,7 @@ def read_csv( **kwds: Unpack[_read_shared[HashableT]], ) -> DataFrame: ... + @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], @@ -1045,7 +1046,8 @@ def read_table( Duplicates in this list are not allowed. index_col : Hashable, Sequence of Hashable or False, optional Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` will be formed for the row labels. Note: ``index_col=False`` can be used to force pandas to *not* use the first From b864687e0d08f483b0564ed553c3cb6506e689f9 Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 24 Oct 2025 12:11:46 +0800 Subject: [PATCH 21/24] Update readers.py --- pandas/io/parsers/readers.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c2eed7ef2ee20..8e4af79e5a462 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1312,11 +1312,9 @@ def read_table( python/generated/pyarrow.csv.ParseOptions.html #pyarrow.csv.ParseOptions.invalid_row_handler>`_. - .. versionadded:: 1.3.0 - - .. versionadded:: 1.4.0 + .. versionadded:: 2.2.0 - Callable + Callable for ``engine='pyarrow'`` low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use From 8dd57ed07fb70c4b798abc09700b01348167a5cc Mon Sep 17 00:00:00 2001 From: huhu-dsy Date: Fri, 24 Oct 2025 14:08:19 +0800 Subject: [PATCH 22/24] Update readers.py --- pandas/io/parsers/readers.py | 47 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8e4af79e5a462..6d8d7825b96bb 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -463,14 +463,14 @@ def read_csv( then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. index_col : Hashable, Sequence of Hashable or False, optional - Column(s) to use as row label(s), denoted either by column labels or column - indices. If a sequence of labels or indices is given, - :class:`~pandas.MultiIndex` - will be formed for the row labels. - - Note: ``index_col=False`` can be used to force pandas to *not* use the first - column as the index, e.g., when you have a malformed file with delimiters at - the end of each line. + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, + :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. 
From 8dd57ed07fb70c4b798abc09700b01348167a5cc Mon Sep 17 00:00:00 2001
From: huhu-dsy 
Date: Fri, 24 Oct 2025 14:08:19 +0800
Subject: [PATCH 22/24] Update readers.py

---
 pandas/io/parsers/readers.py | 47 ++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 8e4af79e5a462..6d8d7825b96bb 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -463,14 +463,14 @@ def read_csv(
 then you should explicitly pass ``header=0`` to override the column names.
 Duplicates in this list are not allowed.
 index_col : Hashable, Sequence of Hashable or False, optional
- Column(s) to use as row label(s), denoted either by column labels or column
- indices. If a sequence of labels or indices is given,
- :class:`~pandas.MultiIndex`
- will be formed for the row labels.
-
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g., when you have a malformed file with delimiters at
- the end of each line.
+ Column(s) to use as row label(s), denoted either by column labels or column
+ indices. If a sequence of labels or indices is given,
+ :class:`~pandas.MultiIndex`
+ will be formed for the row labels.
+
+ Note: ``index_col=False`` can be used to force pandas to *not* use the first
+ column as the index, e.g., when you have a malformed file with delimiters at
+ the end of each line.
 usecols : Sequence of Hashable or Callable, optional
 Subset of columns to select, denoted either by column labels or column
 indices.
@@ -733,6 +733,12 @@ def read_csv(
 /generated/pyarrow.csv.ParseOptions.html
 #pyarrow.csv.ParseOptions.invalid_row_handler>`_.

+ .. versionadded:: 1.3.0
+
+ .. versionadded:: 1.4.0
+
+ Callable
+
 .. versionchanged:: 2.2.0

 Callable for ``engine='pyarrow'``

@@ -1045,14 +1051,13 @@ def read_table(
 then you should explicitly pass ``header=0`` to override the column names.
 Duplicates in this list are not allowed.
 index_col : Hashable, Sequence of Hashable or False, optional
- Column(s) to use as row label(s), denoted either by column labels or column
- indices. If a sequence of labels or indices is given,
- :class:`~pandas.MultiIndex`
- will be formed for the row labels.
-
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g., when you have a malformed file with delimiters at
- the end of each line.
+ Column(s) to use as row label(s), denoted either by column labels or column
+ indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
+ will be formed for the row labels.
+
+ Note: ``index_col=False`` can be used to force pandas to *not* use the first
+ column as the index, e.g., when you have a malformed file with delimiters at
+ the end of each line.
 usecols : Sequence of Hashable or Callable, optional
 Subset of columns to select, denoted either by column labels or column
 indices. If list-like, all elements must either
@@ -1317,7 +1317,13 @@ def read_table(
 python/generated/pyarrow.csv.ParseOptions.html
 #pyarrow.csv.ParseOptions.invalid_row_handler>`_.

- .. versionadded:: 2.2.0
+ .. versionadded:: 1.3.0
+
+ .. versionadded:: 1.4.0
+
+ Callable
+
+ .. versionchanged:: 2.2.0

 Callable for ``engine='pyarrow'``

@@ -2417,4 +2428,4 @@ def _validate_skipfooter(kwds: dict[str, Any]) -> None:
 if kwds.get("iterator") or kwds.get("chunksize"):
 raise ValueError("'skipfooter' not supported for iteration")
 if kwds.get("nrows"):
- raise ValueError("'skipfooter' not supported with 'nrows'")
+ raise ValueError("'skipfooter' not supported with 'nrows'")
\ No newline at end of file
From 2c2adb941c9d670c5b16a4a48643037094f93043 Mon Sep 17 00:00:00 2001
From: huhu-dsy 
Date: Fri, 24 Oct 2025 14:14:18 +0800
Subject: [PATCH 23/24] Update readers.py

---
 pandas/io/parsers/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 6d8d7825b96bb..cb875ea80ea3c 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -2428,4 +2428,4 @@ def _validate_skipfooter(kwds: dict[str, Any]) -> None:
 if kwds.get("iterator") or kwds.get("chunksize"):
 raise ValueError("'skipfooter' not supported for iteration")
 if kwds.get("nrows"):
- raise ValueError("'skipfooter' not supported with 'nrows'")
\ No newline at end of file
+ raise ValueError("'skipfooter' not supported with 'nrows'")
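Patches 22 and 23 only toggle the trailing newline after ``_validate_skipfooter``, but the guard those hunks show is observable from the public API. A minimal sketch, with illustrative data:

    from io import StringIO

    import pandas as pd

    data = "a,b\n1,2\n3,4\nTOTAL,6\n"

    # skipfooter drops trailing lines; it requires the python engine.
    df = pd.read_csv(StringIO(data), skipfooter=1, engine="python")

    # Combining it with nrows trips the validation shown in the hunk.
    try:
        pd.read_csv(StringIO(data), skipfooter=1, nrows=1, engine="python")
    except ValueError as exc:
        print(exc)  # 'skipfooter' not supported with 'nrows'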
From af5d1ce7260b6c24fe1f12070a4d8a36e3ef62d0 Mon Sep 17 00:00:00 2001
From: huhu-dsy 
Date: Fri, 24 Oct 2025 15:02:36 +0800
Subject: [PATCH 24/24] Update readers.py

---
 pandas/io/parsers/readers.py | 45 ++++++++++++++----------------------
 1 file changed, 17 insertions(+), 28 deletions(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index cb875ea80ea3c..8e4af79e5a462 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -463,14 +463,14 @@ def read_csv(
 then you should explicitly pass ``header=0`` to override the column names.
 Duplicates in this list are not allowed.
 index_col : Hashable, Sequence of Hashable or False, optional
- Column(s) to use as row label(s), denoted either by column labels or column
- indices. If a sequence of labels or indices is given,
- :class:`~pandas.MultiIndex`
- will be formed for the row labels.
-
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g., when you have a malformed file with delimiters at
- the end of each line.
+ Column(s) to use as row label(s), denoted either by column labels or column
+ indices. If a sequence of labels or indices is given,
+ :class:`~pandas.MultiIndex`
+ will be formed for the row labels.
+
+ Note: ``index_col=False`` can be used to force pandas to *not* use the first
+ column as the index, e.g., when you have a malformed file with delimiters at
+ the end of each line.
 usecols : Sequence of Hashable or Callable, optional
 Subset of columns to select, denoted either by column labels or column
 indices.
@@ -733,12 +733,6 @@ def read_csv(
 /generated/pyarrow.csv.ParseOptions.html
 #pyarrow.csv.ParseOptions.invalid_row_handler>`_.

- .. versionadded:: 1.3.0
-
- .. versionadded:: 1.4.0
-
- Callable
-
 .. versionchanged:: 2.2.0

 Callable for ``engine='pyarrow'``

@@ -1051,13 +1045,14 @@ def read_table(
 then you should explicitly pass ``header=0`` to override the column names.
 Duplicates in this list are not allowed.
 index_col : Hashable, Sequence of Hashable or False, optional
- Column(s) to use as row label(s), denoted either by column labels or column
- indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex`
- will be formed for the row labels.
-
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g., when you have a malformed file with delimiters at
- the end of each line.
+ Column(s) to use as row label(s), denoted either by column labels or column
+ indices. If a sequence of labels or indices is given,
+ :class:`~pandas.MultiIndex`
+ will be formed for the row labels.
+
+ Note: ``index_col=False`` can be used to force pandas to *not* use the first
+ column as the index, e.g., when you have a malformed file with delimiters at
+ the end of each line.
 usecols : Sequence of Hashable or Callable, optional
 Subset of columns to select, denoted either by column labels or column
 indices. If list-like, all elements must either
@@ -1317,13 +1312,7 @@ def read_table(
 python/generated/pyarrow.csv.ParseOptions.html
 #pyarrow.csv.ParseOptions.invalid_row_handler>`_.

- .. versionadded:: 1.3.0
-
- .. versionadded:: 1.4.0
-
- Callable
-
- .. versionchanged:: 2.2.0
+ .. versionadded:: 2.2.0

 Callable for ``engine='pyarrow'``
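Throughout the series, ``read_table`` keeps the same parameter docstrings as ``read_csv``, differing only in the tab default for ``sep``. A minimal sketch of that equivalence, with illustrative data:

    from io import StringIO

    import pandas as pd

    tsv = "col 1\tcol 2\n1\t2\n3\t4\n"

    # read_table is read_csv with sep defaulting to '\t'; both parse
    # this buffer identically.
    via_table = pd.read_table(StringIO(tsv))
    via_csv = pd.read_csv(StringIO(tsv), sep="\t")
    assert via_table.equals(via_csv)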