Skip to content

Commit d7e137e

Browse files
authored
Enable remaining pylints (#1298)
* Now that we are on Python 3.10, change from Union and Optional to | * Enable additional lint * Add check for dead code * Verify all Python arguments have type annotations * Add return types on functions * Cleaning up pyproj * More lints * Enable path ruff check * Fix Path.glob code * Remove deprecated test * Expect deprecation warning
1 parent 51dc78a commit d7e137e

22 files changed

+216
-238
lines changed

benchmarks/db-benchmark/groupby-datafusion.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import gc
1919
import os
2020
import timeit
21+
from pathlib import Path
2122

2223
import datafusion as df
2324
import pyarrow as pa
@@ -34,7 +35,7 @@
3435

3536
print("# groupby-datafusion.py", flush=True)
3637

37-
exec(open("./_helpers/helpers.py").read())
38+
exec(Path.open("./_helpers/helpers.py").read())
3839

3940

4041
def ans_shape(batches) -> tuple[int, int]:
@@ -65,7 +66,7 @@ def execute(df) -> list:
6566
sql = True
6667

6768
data_name = os.environ["SRC_DATANAME"]
68-
src_grp = os.path.join("data", data_name + ".csv")
69+
src_grp = "data" / data_name / ".csv"
6970
print("loading dataset %s" % src_grp, flush=True)
7071

7172
schema = pa.schema(

benchmarks/db-benchmark/join-datafusion.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import gc
1919
import os
2020
import timeit
21+
from pathlib import Path
2122

2223
import datafusion as df
2324
from datafusion import col
@@ -26,7 +27,7 @@
2627

2728
print("# join-datafusion.py", flush=True)
2829

29-
exec(open("./_helpers/helpers.py").read())
30+
exec(Path.open("./_helpers/helpers.py").read())
3031

3132

3233
def ans_shape(batches) -> tuple[int, int]:
@@ -49,12 +50,12 @@ def ans_shape(batches) -> tuple[int, int]:
4950
on_disk = "FALSE"
5051

5152
data_name = os.environ["SRC_DATANAME"]
52-
src_jn_x = os.path.join("data", data_name + ".csv")
53+
src_jn_x = "data" / data_name / ".csv"
5354
y_data_name = join_to_tbls(data_name)
5455
src_jn_y = [
55-
os.path.join("data", y_data_name[0] + ".csv"),
56-
os.path.join("data", y_data_name[1] + ".csv"),
57-
os.path.join("data", y_data_name[2] + ".csv"),
56+
"data" / y_data_name[0] / ".csv",
57+
"data" / y_data_name[1] / ".csv",
58+
"data" / y_data_name[2] / ".csv",
5859
]
5960
if len(src_jn_y) != 3:
6061
error_msg = "Something went wrong in preparing files used for join"

benchmarks/tpch/tpch.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717

1818
import argparse
1919
import time
20+
from pathlib import Path
2021

2122
from datafusion import SessionContext
2223

2324

2425
def bench(data_path, query_path) -> None:
25-
with open("results.csv", "w") as results:
26+
with Path.open("results.csv", "w") as results:
2627
# register tables
2728
start = time.time()
2829
total_time_millis = 0
@@ -45,7 +46,7 @@ def bench(data_path, query_path) -> None:
4546
print("Configuration:\n", ctx)
4647

4748
# register tables
48-
with open("create_tables.sql") as f:
49+
with Path.open("create_tables.sql") as f:
4950
sql = ""
5051
for line in f.readlines():
5152
if line.startswith("--"):
@@ -65,7 +66,7 @@ def bench(data_path, query_path) -> None:
6566

6667
# run queries
6768
for query in range(1, 23):
68-
with open(f"{query_path}/q{query}.sql") as f:
69+
with Path.open(f"{query_path}/q{query}.sql") as f:
6970
text = f.read()
7071
tmp = text.split(";")
7172
queries = [s.strip() for s in tmp if len(s.strip()) > 0]

dev/create_license.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import json
2222
import subprocess
23+
from pathlib import Path
2324

2425
subprocess.check_output(["cargo", "install", "cargo-license"])
2526
data = subprocess.check_output(
@@ -248,5 +249,5 @@
248249
result += "------------------\n\n"
249250
result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n"
250251

251-
with open("LICENSE.txt", "w") as f:
252+
with Path.open("LICENSE.txt", "w") as f:
252253
f.write(result)

dev/release/check-rat-report.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import re
2222
import sys
2323
import xml.etree.ElementTree as ET
24+
from pathlib import Path
2425

2526
if len(sys.argv) != 3:
2627
sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
@@ -29,7 +30,7 @@
2930
exclude_globs_filename = sys.argv[1]
3031
xml_filename = sys.argv[2]
3132

32-
globs = [line.strip() for line in open(exclude_globs_filename)]
33+
globs = [line.strip() for line in Path.open(exclude_globs_filename)]
3334

3435
tree = ET.parse(xml_filename)
3536
root = tree.getroot()

examples/python-udf-comparisons.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,16 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
import os
1918
import time
19+
from pathlib import Path
2020

2121
import pyarrow as pa
2222
import pyarrow.compute as pc
2323
from datafusion import SessionContext, col, lit, udf
2424
from datafusion import functions as F
2525

26-
path = os.path.dirname(os.path.abspath(__file__))
27-
filepath = os.path.join(path, "./tpch/data/lineitem.parquet")
26+
path = Path(__file__).parent.resolve()
27+
filepath = path / "./tpch/data/lineitem.parquet"
2828

2929
# This example serves to demonstrate alternate approaches to answering the
3030
# question "return all of the rows that have a specific combination of these

examples/tpch/convert_data_to_parquet.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
as will be generated by the script provided in this repository.
2323
"""
2424

25-
import os
25+
from pathlib import Path
2626

2727
import datafusion
2828
import pyarrow as pa
@@ -116,7 +116,7 @@
116116
("S_COMMENT", pa.string()),
117117
]
118118

119-
curr_dir = os.path.dirname(os.path.abspath(__file__))
119+
curr_dir = Path(__file__).resolve().parent
120120
for filename, curr_schema_val in all_schemas.items():
121121
# For convenience, go ahead and convert the schema column names to lowercase
122122
curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val]
@@ -132,10 +132,8 @@
132132

133133
schema = pa.schema(curr_schema)
134134

135-
source_file = os.path.abspath(
136-
os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv")
137-
)
138-
dest_file = os.path.abspath(os.path.join(curr_dir, f"./data/{filename}.parquet"))
135+
source_file = (curr_dir / f"../../benchmarks/tpch/data/{filename}.csv").resolve()
136+
dest_file = (curr_dir / f"./data/{filename}.parquet").resolve()
139137

140138
df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")
141139

examples/tpch/q07_volume_shipping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
# not match these will result in a null value and then get filtered out.
8181
#
8282
# To do the same using a simple filter would be:
83-
# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2))
83+
# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2)) # noqa: ERA001
8484
df_nation = df_nation.with_column(
8585
"n_name",
8686
F.case(col("n_name"))

examples/tpch/q12_ship_mode_order_priority.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
# matches either of the two values, but we want to show doing some array operations in this
7474
# example. If you want to see this done with filters, comment out the above line and uncomment
7575
# this one.
76-
# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2)))
76+
# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2))) # noqa: ERA001
7777

7878

7979
# We need order priority, so join order df to line item

examples/tpch/util.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,16 @@
1919
Common utilities for running TPC-H examples.
2020
"""
2121

22-
import os
22+
from pathlib import Path
2323

2424

25-
def get_data_path(filename: str) -> str:
26-
path = os.path.dirname(os.path.abspath(__file__))
25+
def get_data_path(filename: str) -> Path:
26+
path = Path(__file__).resolve().parent
2727

28-
return os.path.join(path, "data", filename)
28+
return path / "data" / filename
2929

3030

31-
def get_answer_file(answer_file: str) -> str:
32-
path = os.path.dirname(os.path.abspath(__file__))
31+
def get_answer_file(answer_file: str) -> Path:
32+
path = Path(__file__).resolve().parent
3333

34-
return os.path.join(
35-
path, "../../benchmarks/tpch/data/answers", f"{answer_file}.out"
36-
)
34+
return path / "../../benchmarks/tpch/data/answers" / f"{answer_file}.out"

0 commit comments

Comments
 (0)