Commit 644b54e

RING-44425 - input structures, required fields
1 parent 50c5108 commit 644b54e

5 files changed: +47 -11 lines changed

scripts/S3_FSCK/s3_fsck_p0.py

Lines changed: 4 additions & 2 deletions
@@ -195,11 +195,13 @@ def blob(row):
 new_path = os.path.join(PATH, RING, "s3-bucketd")
 files = "%s://%s" % (PROTOCOL, new_path)
 
-# reading without a header,
-# columns _c0, _c1, _c2 are the default column names of
+# reading without a header,
+# columns _c0, _c1, _c2 are the default column names of
 # columns 1, 2, 3 for the csv
 # input structure: (bucket name, s3 object key, sproxyd input key)
 # e.g. test,48K_object.01,9BC9C6080ED24A42C2F1A9C78F6BCD5967F70220
+# Required Fields:
+# - _c2 (sproxyd input key)
 df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").option("delimiter", ",").load(files)
 
 # repartition the dataframe to have the same number of partitions as the number of executors * cores
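For reference, a minimal PySpark sketch of the read pattern this hunk documents. The stand-in path s3-bucketd-sample.csv and the final select are illustrative assumptions; only the column layout and the required field _c2 come from the diff.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("p0-sketch").getOrCreate()

# headerless read: _c0 (bucket name), _c1 (s3 object key), _c2 (sproxyd input key)
df = (spark.read.format("csv")
      .option("header", "false")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .load("s3-bucketd-sample.csv"))  # hypothetical stand-in path

# _c2 is the only field required downstream
df.select("_c2").show(5, truncate=False)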

scripts/S3_FSCK/s3_fsck_p1.py

Lines changed: 7 additions & 3 deletions
@@ -39,10 +39,14 @@
 
 
 files = "%s://%s/%s/listkeys.csv" % (PROTOCOL, PATH, RING)
-# listkeys.csv structure:
-# { RING key, main chunk of the RING key, disk, flag }
+# reading without a header,
+# columns _c0, _c1, _c2, _c3 are the default column names of
+# columns 1, 2, 3, 4 for the csv
+# input structure: (RING key, main chunk, disk, flag)
 # e.g. 555555A4948FAA554034E155555555A61470C07A,8000004F3F3A54FFEADF8C00000000511470C070,g1disk1,0
-# reading listkeys.csv files without a header, the _c0, _c1, _c2, _c3 are the default column names for column 1, 2, 3, 4
+# Required Fields:
+# - _c1 (main chunk)
+# - _c3 (FLAG)
 df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").option("delimiter", ",").load(files)
 
 # list the ARC SPLIT main chunks with service ID 50 from column 2
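A minimal sketch of how the two required fields could be consumed. The stand-in path and the flag == 0 "keep" condition are assumptions for illustration; the actual selection (ARC SPLIT main chunks with service ID 50) lives in s3_fsck_p1.py.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("p1-sketch").getOrCreate()

# headerless read: _c0 (RING key), _c1 (main chunk), _c2 (disk), _c3 (flag)
df = (spark.read.format("csv")
      .option("header", "false")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .load("listkeys-sample.csv"))  # hypothetical stand-in path

# keep only the required fields: the main chunk, filtered on the flag
# (flag == 0 as the keep condition is an assumption, not the script's logic)
main_chunks = df.filter(col("_c3") == 0).select("_c1")
main_chunks.show(5, truncate=False)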

scripts/S3_FSCK/s3_fsck_p2.py

Lines changed: 20 additions & 4 deletions
@@ -39,14 +39,30 @@
 .getOrCreate()
 
 
-# s3keys are read from the verifySproxydKeys.js scripts output
+# s3keys are generated by the verifySproxydKeys.js script and processed by s3_fsck_p0.py
 s3keys = "%s://%s/%s/s3fsck/s3-dig-keys.csv" % (PROTOCOL, PATH, RING)
-# ringkeys are read from the listkeys.py (or ringsh dump) scripts output
+# ringkeys are generated by the listkeys.py (or ringsh dump) script and processed by s3_fsck_p1.py
 ringkeys = "%s://%s/%s/s3fsck/arc-keys.csv" % (PROTOCOL, PATH, RING)
 
-# reading with a header, the columns are named. The column _c1 will be whatever column the _c1 header is assigned to
+# reading with a header, the columns are named.
+# columns digkey, sproxyd input key, subkey are the actual column names of
+# columns 1, 2, 3 for the csv
+# input structure: (digkey, sproxyd input key, subkey)
+# e.g. 7359114991482315D0A5890000,BDE4B9BBEB45711EC2F1A9C78F6BCD59E02C6220,SINGLE
+# Required Fields:
+# - digkey
+# - sproxyd input key
 dfs3keys = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(s3keys)
-# reading with a header, the columns are named. The column _c1 will be whatever column the _c1 header is assigned to
+
+
+# reading with a header, the columns are named.
+# columns _c1, count, ringkey (main chunk) are the actual column names of
+# columns 1, 2, 3 for the csv
+# input structure: (digkey, count, ringkey (main chunk))
+# e.g. 7359114991482315D0A5890000,BDE4B9BBEB45711EC2F1A9C78F6BCD59E02C6220,SINGLE
+# Required Fields:
+# - digkey
+# - ringkey (main chunk)
 dfringkeys = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(ringkeys)
 
 # rename the column _c1 to digkey, the next write will output a header that uses digkey instead of _c1
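A minimal sketch of how the two named inputs line up on digkey once _c1 is renamed. The rename is in the script; the left_anti join and the stand-in paths are assumptions about the comparison, not the script's actual logic.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("p2-sketch").getOrCreate()

# both inputs carry a header, so columns are read by name
dfs3keys = (spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load("s3-dig-keys-sample.csv"))  # hypothetical stand-in path
dfringkeys = (spark.read.format("csv")
              .option("header", "true")
              .option("inferSchema", "true")
              .load("arc-keys-sample.csv"))   # hypothetical stand-in path

# as the script's next comment says, _c1 is renamed to digkey
dfringkeys = dfringkeys.withColumnRenamed("_c1", "digkey")

# ring entries whose digkey never appears on the S3 side
# (left_anti as the comparison is an assumption for illustration)
orphans = dfringkeys.join(dfs3keys, on="digkey", how="left_anti")
orphans.show(5, truncate=False)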

scripts/S3_FSCK/s3_fsck_p3.py

Lines changed: 9 additions & 1 deletion
@@ -84,8 +84,16 @@ def statkey(row):
 
 
 files = "%s://%s/%s/s3fsck/s3objects-missing.csv" % (PROTOCOL, PATH, RING)
-# Create a dataframe from the csv file not using the header, the columns will be _c0, _c1, _c2
+
+# reading without a header,
+# column _c0 is the default column name of
+# column 1 for the csv
+# input structure: _c0 (main chunk)
+# e.g. 998C4DF2FC7389A7C82A9600000000512040C070
+# Required Fields:
+# - _c0 (main chunk)
 df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(files)
+
 # Create a resilient distributed dataset (RDD) from the dataframe (logical partitions of data)
 # The rdd is a collection of tuples returned from statkey (key, status_code, size)
 rdd = df.rdd.map(statkey)
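A minimal sketch of the _c0-to-statkey flow. The statkey here is a stub and the path a stand-in; only the headerless read and the (key, status_code, size) tuple shape come from the script's comments.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("p3-sketch").getOrCreate()

# headerless read: the single required column _c0 holds the main chunk
df = (spark.read.format("csv")
      .option("header", "false")
      .option("inferSchema", "true")
      .load("s3objects-missing-sample.csv"))  # hypothetical stand-in path

def statkey(row):
    # stub: the real statkey in s3_fsck_p3.py checks the chunk on the RING;
    # only the (key, status_code, size) shape is taken from the source comments
    return (row._c0, "200", 0)

rdd = df.rdd.map(statkey)
print(rdd.take(5))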

scripts/S3_FSCK/s3_fsck_p4.py

Lines changed: 7 additions & 1 deletion
@@ -69,7 +69,13 @@ def deletekey(row):
 
 files = "%s://%s/%s/s3fsck/s3objects-missing.csv" % (PROTOCOL, PATH, RING)
 
-# reading without a header, the _c0, _c1, _c2, _c3 are the default column names for column 1, 2, 3, 4
+# reading without a header,
+# column _c0 is the default column name of
+# column 1 for the csv
+# input structure: _c0 (main chunk)
+# e.g. 998C4DF2FC7389A7C82A9600000000512040C070
+# Required Fields:
+# - _c0 (main chunk)
 df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(files)
 # rename the column _c0 (column 1) to ringkey
 df = df.withColumnRenamed("_c0","ringkey")
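A minimal sketch of the rename-then-delete flow. deletekey here is a dry-run stub and the path a stand-in; only the headerless read and the _c0-to-ringkey rename come from the diff.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("p4-sketch").getOrCreate()

# headerless read: the single required column _c0 holds the main chunk
df = (spark.read.format("csv")
      .option("header", "false")
      .option("inferSchema", "true")
      .load("s3objects-missing-sample.csv"))  # hypothetical stand-in path

# rename column 1 so later output headers say ringkey instead of _c0
df = df.withColumnRenamed("_c0", "ringkey")

def deletekey(row):
    # dry-run stub: the real deletekey in s3_fsck_p4.py removes the key from the RING
    return (row.ringkey, "would-delete")

print(df.rdd.map(deletekey).take(5))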
