apache
diff --git a/‎benchmarks/db-benchmark/groupby-datafusion.py‎
Lines changed: 3 additions & 2 deletions b/‎benchmarks/db-benchmark/groupby-datafusion.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎benchmarks/db-benchmark/join-datafusion.py‎
Lines changed: 6 additions & 5 deletions b/‎benchmarks/db-benchmark/join-datafusion.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎benchmarks/tpch/tpch.py‎
Lines changed: 4 additions & 3 deletions b/‎benchmarks/tpch/tpch.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎dev/create_license.py‎
Lines changed: 2 additions & 1 deletion b/‎dev/create_license.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎dev/release/check-rat-report.py‎
Lines changed: 2 additions & 1 deletion b/‎dev/release/check-rat-report.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/python-udf-comparisons.py‎
Lines changed: 3 additions & 3 deletions b/‎examples/python-udf-comparisons.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎examples/tpch/convert_data_to_parquet.py‎
Lines changed: 4 additions & 6 deletions b/‎examples/tpch/convert_data_to_parquet.py‎
Lines changed: 4 additions & 6 deletions
diff --git a/‎examples/tpch/q07_volume_shipping.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/tpch/q07_volume_shipping.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/tpch/q12_ship_mode_order_priority.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/tpch/q12_ship_mode_order_priority.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/tpch/util.py‎
Lines changed: 7 additions & 9 deletions b/‎examples/tpch/util.py‎
Lines changed: 7 additions & 9 deletions
@@ -18,6 +18,7 @@
 import gc
 import os
 import timeit
+from pathlib import Path
 
 import datafusion as df
 import pyarrow as pa
@@ -34,7 +35,7 @@
 
 print("# groupby-datafusion.py", flush=True)
 
-exec(open("./_helpers/helpers.py").read())
+exec(Path("./_helpers/helpers.py").read_text())
 
 
 def ans_shape(batches) -> tuple[int, int]:
@@ -65,7 +66,7 @@ def execute(df) -> list:
 sql = True
 
 data_name = os.environ["SRC_DATANAME"]
-src_grp = os.path.join("data", data_name + ".csv")
+src_grp = Path("data") / f"{data_name}.csv"
 print("loading dataset %s" % src_grp, flush=True)
 
 schema = pa.schema(
 
@@ -18,6 +18,7 @@
 import gc
 import os
 import timeit
+from pathlib import Path
 
 import datafusion as df
 from datafusion import col
@@ -26,7 +27,7 @@
 
 print("# join-datafusion.py", flush=True)
 
-exec(open("./_helpers/helpers.py").read())
+exec(Path("./_helpers/helpers.py").read_text())
 
 
 def ans_shape(batches) -> tuple[int, int]:
@@ -49,12 +50,12 @@ def ans_shape(batches) -> tuple[int, int]:
 on_disk = "FALSE"
 
 data_name = os.environ["SRC_DATANAME"]
-src_jn_x = os.path.join("data", data_name + ".csv")
+src_jn_x = Path("data") / f"{data_name}.csv"
 y_data_name = join_to_tbls(data_name)
 src_jn_y = [
-    os.path.join("data", y_data_name[0] + ".csv"),
-    os.path.join("data", y_data_name[1] + ".csv"),
-    os.path.join("data", y_data_name[2] + ".csv"),
+    Path("data") / f"{y_data_name[0]}.csv",
+    Path("data") / f"{y_data_name[1]}.csv",
+    Path("data") / f"{y_data_name[2]}.csv",
 ]
 if len(src_jn_y) != 3:
     error_msg = "Something went wrong in preparing files used for join"
 
@@ -17,12 +17,13 @@
 
 import argparse
 import time
+from pathlib import Path
 
 from datafusion import SessionContext
 
 
 def bench(data_path, query_path) -> None:
-    with open("results.csv", "w") as results:
+    with Path("results.csv").open("w") as results:
         # register tables
         start = time.time()
         total_time_millis = 0
@@ -45,7 +46,7 @@ def bench(data_path, query_path) -> None:
         print("Configuration:\n", ctx)
 
         # register tables
-        with open("create_tables.sql") as f:
+        with Path("create_tables.sql").open() as f:
             sql = ""
             for line in f.readlines():
                 if line.startswith("--"):
@@ -65,7 +66,7 @@ def bench(data_path, query_path) -> None:
 
         # run queries
         for query in range(1, 23):
-            with open(f"{query_path}/q{query}.sql") as f:
+            with Path(f"{query_path}/q{query}.sql").open() as f:
                 text = f.read()
                 tmp = text.split(";")
                 queries = [s.strip() for s in tmp if len(s.strip()) > 0]
 
@@ -20,6 +20,7 @@
 
 import json
 import subprocess
+from pathlib import Path
 
 subprocess.check_output(["cargo", "install", "cargo-license"])
 data = subprocess.check_output(
@@ -248,5 +249,5 @@
     result += "------------------\n\n"
     result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n"
 
-with open("LICENSE.txt", "w") as f:
+with Path("LICENSE.txt").open("w") as f:
     f.write(result)
@@ -21,6 +21,7 @@
 import re
 import sys
 import xml.etree.ElementTree as ET
+from pathlib import Path
 
 if len(sys.argv) != 3:
     sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
@@ -29,7 +30,7 @@
 exclude_globs_filename = sys.argv[1]
 xml_filename = sys.argv[2]
 
-globs = [line.strip() for line in open(exclude_globs_filename)]
+globs = [line.strip() for line in Path(exclude_globs_filename).open()]
 
 tree = ET.parse(xml_filename)
 root = tree.getroot()
 
@@ -15,16 +15,16 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import time
+from pathlib import Path
 
 import pyarrow as pa
 import pyarrow.compute as pc
 from datafusion import SessionContext, col, lit, udf
 from datafusion import functions as F
 
-path = os.path.dirname(os.path.abspath(__file__))
-filepath = os.path.join(path, "./tpch/data/lineitem.parquet")
+path = Path(__file__).parent.resolve()
+filepath = path / "./tpch/data/lineitem.parquet"
 
 # This example serves to demonstrate alternate approaches to answering the
 # question "return all of the rows that have a specific combination of these
 
@@ -22,7 +22,7 @@
 as will be generated by the script provided in this repository.
 """
 
-import os
+from pathlib import Path
 
 import datafusion
 import pyarrow as pa
@@ -116,7 +116,7 @@
     ("S_COMMENT", pa.string()),
 ]
 
-curr_dir = os.path.dirname(os.path.abspath(__file__))
+curr_dir = Path(__file__).resolve().parent
 for filename, curr_schema_val in all_schemas.items():
     # For convenience, go ahead and convert the schema column names to lowercase
     curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val]
@@ -132,10 +132,8 @@
 
     schema = pa.schema(curr_schema)
 
-    source_file = os.path.abspath(
-        os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv")
-    )
-    dest_file = os.path.abspath(os.path.join(curr_dir, f"./data/{filename}.parquet"))
+    source_file = (curr_dir / f"../../benchmarks/tpch/data/{filename}.csv").resolve()
+    dest_file = (curr_dir / f"./data/{filename}.parquet").resolve()
 
     df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")
 
 
@@ -80,7 +80,7 @@
 # not match these will result in a null value and then get filtered out.
 #
 # To do the same using a simple filter would be:
-# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2))
+# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2)) # noqa: ERA001
 df_nation = df_nation.with_column(
     "n_name",
     F.case(col("n_name"))
 
@@ -73,7 +73,7 @@
 # matches either of the two values, but we want to show doing some array operations in this
 # example. If you want to see this done with filters, comment out the above line and uncomment
 # this one.
-# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2)))
+# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2))) # noqa: ERA001
 
 
 # We need order priority, so join order df to line item
 
@@ -19,18 +19,16 @@
 Common utilities for running TPC-H examples.
 """
 
-import os
+from pathlib import Path
 
 
-def get_data_path(filename: str) -> str:
-    path = os.path.dirname(os.path.abspath(__file__))
+def get_data_path(filename: str) -> Path:
+    path = Path(__file__).resolve().parent
 
-    return os.path.join(path, "data", filename)
+    return path / "data" / filename
 
 
-def get_answer_file(answer_file: str) -> str:
-    path = os.path.dirname(os.path.abspath(__file__))
+def get_answer_file(answer_file: str) -> Path:
+    path = Path(__file__).resolve().parent
 
-    return os.path.join(
-        path, "../../benchmarks/tpch/data/answers", f"{answer_file}.out"
-    )
+    return path / "../../benchmarks/tpch/data/answers" / f"{answer_file}.out"
Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,7 @@`
`80`	`80`	`# not match these will result in a null value and then get filtered out.`
`81`	`81`	`#`
`82`	`82`	`# To do the same using a simple filter would be:`
`83`		`-# df_nation = df_nation.filter((F.col("n_name") == nation_1) \| (F.col("n_name") == nation_2))`
	`83`	`+# df_nation = df_nation.filter((F.col("n_name") == nation_1) \| (F.col("n_name") == nation_2)) # noqa: ERA001`
`84`	`84`	`df_nation = df_nation.with_column(`
`85`	`85`	`"n_name",`
`86`	`86`	`F.case(col("n_name"))`