-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
445 lines (398 loc) · 13.6 KB
/
main.py
File metadata and controls
445 lines (398 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
import argparse
from dotenv import dotenv_values
from datetime import datetime
import os
from pathlib import Path
import sys
from diff import run_diff
from db.mongo import MongoClient
from utils.logging import getLogger
from utils import fetch
from export import (
get_files_from_db,
export_lists,
export_label_collection_as_csv_zip,
)
# Logger shared by everything in this entry-point module.
_logger = getLogger("main")

# Settings parsed from the ".env" file located in the same directory as this
# module (after resolving symlinks), copied into a plain dict.
_module_dir = os.path.dirname(os.path.realpath(__file__))
_config = dict(dotenv_values(os.path.join(_module_dir, ".", ".env")))
# Resource folders: they hold the data files this package reads and writes.
RESOURCE_FOLDER = "resources"
ORANGE_BOOK_FOLDER, PROCESSED_LOGS = [
    os.path.join(RESOURCE_FOLDER, sub_folder)
    for sub_folder in ("Orange_Book", "processed_log")
]

# CSV log files, used internally by the package to track completed database
# tasks. All of them live in PROCESSED_LOGS.

# Logs of the diff module.
(
    PROCESSED_ID_DIFF_FILE,
    PROCESSED_NDA_DIFF_FILE,
    UNPROCESSED_ID_DIFF_FILE,
) = [
    os.path.join(PROCESSED_LOGS, csv_name)
    for csv_name in (
        "diff_processed_id.csv",
        "diff_processed_nda.csv",
        "diff_unprocessed_id.csv",
    )
]

# Logs of the similarity module.
(
    PROCESSED_ID_SIMILARITY_FILE,
    PROCESSED_NDA_SIMILARITY_FILE,
    UNPROCESSED_ID_SIMILARITY_FILE,
    UNPROCESSED_NDA_SIMILARITY_FILE,
) = [
    os.path.join(PROCESSED_LOGS, csv_name)
    for csv_name in (
        "similarity_processed_id.csv",
        "similarity_processed_NDA.csv",
        "similarity_unprocessed_id.csv",
        "similarity_unprocessed_NDA.csv",
    )
]
def parse_args(argv=None):
    """Define the package's command-line interface and parse the arguments.

    Every path-valued option is declared with ``nargs="?"``: its attribute
    is ``None`` when the flag is absent, the option's default path when the
    flag is given without a value, and ``Path(value)`` when a value is
    supplied. Default paths are anchored at the directory containing this
    module.

    Args:
        argv: Optional sequence of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: one attribute per option defined below.
    """
    # Anchor for every default path; computed once instead of per option.
    package_dir = Path(__file__).absolute().parent
    db_state_dir = package_dir / "assets" / "db_state"
    database_latest_dir = package_dir / "resources" / "database_latest"

    parser = argparse.ArgumentParser(
        description=(
            "Running this package without any optional argument will calculate "
            "and store diffs between adjacent labels (by date) for each drug as "
            "defined by NDA number(s) into the label collection of the MongoDB "
            "database. The name of the MongoDB database is set in the .env file. "
            "The diffs will also be collated to patent claims from the patents "
            "collection of the MongoDB database. "
            "Labels that are already processed are stored in "
            f"{PROCESSED_ID_DIFF_FILE} and {PROCESSED_ID_SIMILARITY_FILE}. "
            "These labels will not be re-processed unless optional argument "
            "'-r' is set. "
            "Running optional arguments other than '-r' will not additionally "
            "run the diffing steps or diffs-to-patent-claims mapping unless "
            "those flags are set."
        )
    )

    def add_path_option(
        short_flag, long_flag, default_path, help_text, metavar="File_Name"
    ):
        """Register a flag taking an optional path, defaulting to *default_path*."""
        parser.add_argument(
            short_flag,
            long_flag,
            nargs="?",
            type=Path,
            const=default_path,
            help=help_text,
            metavar=metavar,
        )

    def add_switch(short_flag, long_flag, help_text):
        """Register a boolean flag that is False unless given."""
        parser.add_argument(
            short_flag, long_flag, action="store_true", help=help_text
        )

    # --- exports of Orange Book contents -------------------------------
    add_path_option(
        "-an",
        "--all_NDA_from_Orange_Book",
        db_state_dir / "all_NDA",
        "Output list of all NDA from Orange Book to File_Name. If unset"
        ", File_Name is '/assets/db_state/all_NDA'.",
    )
    add_path_option(
        "-ap",
        "--all_patents_from_Orange_Book",
        db_state_dir / "all_patents",
        "Output list of patents from Orange Book to File_Name. If unset, "
        "File_Name is '/assets/db_state/all_patents'.",
    )
    add_path_option(
        "-apj",
        "--all_patents_from_Orange_Book_json",
        db_state_dir / "all_patents.json",
        "Output list of patents from Orange Book to File_Name. If unset, "
        "File_Name is '/assets/db_state/all_patents.json'.",
    )

    # --- exports of what the database is missing -----------------------
    add_path_option(
        "-mn",
        "--missing_NDA_from_database",
        db_state_dir / "missing_NDA",
        "Output list of NDA from Orange Book not in MongoDB to "
        "File_Name. If unset, File_Name is '/assets/db_state/missing_NDA'.",
    )
    add_path_option(
        "-mp",
        "--missing_patents_from_database",
        db_state_dir / "missing_patents",
        "Output list of patents from Orange Book not in MongoDB to "
        "File_Name. If unset, File_Name is '/assets/db_state/missing_patents'.",
    )
    add_path_option(
        "-mpj",
        "--missing_patents_from_database_json",
        db_state_dir / "missing_patents.json",
        "Output list of patents from Orange Book not in MongoDB to "
        "File_Name. If unset, File_Name is "
        "'/assets/db_state/missing_patents.json'.",
    )

    # --- maintenance switches ------------------------------------------
    add_switch(
        "-ob",
        "--update_orange_book",
        "Download the latest monthly update to the Orange Book from "
        "https://www.fda.gov/drugs/drug-approvals-and-databases/orange-book-data-files "
        f"into '{ORANGE_BOOK_FOLDER}'.",
    )
    add_switch(
        "-r",
        "--rerun",
        "Delete all process csv's of MongoDB ObjectId and NDAs from folder"
        f" '{PROCESSED_LOGS}', then rerun all diffs and similarity.",
    )

    # --- collection re-imports (development aids) ----------------------
    # The help text names the 'resources' folder so that it matches the
    # default path that is actually used.
    add_path_option(
        "-ril",
        "--reimport_labels",
        database_latest_dir / "labels.json",
        "Reimport label collection from json file. (for development). "
        "If unset, File_Name is '/resources/database_latest/labels.json'.",
    )
    add_path_option(
        "-rilm",
        "--reimport_labelmap",
        database_latest_dir / "labelmap.json",
        "Reimport labelmap collection from json file. (for development). "
        "If unset, File_Name is '/resources/database_latest/labelmap.json'.",
    )
    add_path_option(
        "-rio",
        "--reimport_orange_book",
        database_latest_dir / "orangebook.json",
        "Reimport orange_book collection from json file. (for development). "
        "If unset, File_Name is '/resources/database_latest/orangebook.json'.",
    )
    add_path_option(
        "-rip",
        "--reimport_patents",
        database_latest_dir / "patents.json",
        "Reimport patent collection from json file. (for development). "
        "If unset, File_Name is '/resources/database_latest/patents.json'.",
    )

    # --- processing steps ----------------------------------------------
    add_switch("-diff", "--diff", "Run diffing_algo.")
    add_switch("-similarity", "--similarity", "Run similarity module.")

    # --- database dumps ------------------------------------------------
    add_path_option(
        "-db2file",
        "--db2file",
        package_dir / "analysis" / "db2file",
        "Output list of additions and patent claim sets from the database "
        "to Folder_Name. If unset, Folder_Name is '/analysis/db2file/'.",
        metavar="Folder_Name",
    )
    add_path_option(
        "-db2csv",
        "--db2csv",
        package_dir / "resources" / "hosted_folder" / "db2csv.csv",
        "Output the entire database to a csv file File_Name. If unset"
        ", File_Name is '/resources/hosted_folder/db2csv.csv'.",
    )

    # --- tuning --------------------------------------------------------
    parser.add_argument(
        "-since",
        "--since",
        type=valid_date,
        help=(
            "The starting date for diffing and similarity comparisons. Use "
            "format YYYY-MM-DD."
        ),
    )
    add_switch(
        "-truncate_scores",
        "--truncate_scores",
        "Truncate scores to reduce size of database",
    )

    return parser.parse_args(argv)
def valid_date(s):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime`` (argparse ``type``).

    Args:
        s: The raw command-line value.

    Returns:
        datetime: the result of ``datetime.strptime(s, "%Y-%m-%d")``.

    Raises:
        argparse.ArgumentTypeError: if *s* does not match ``%Y-%m-%d``. The
            underlying ``ValueError`` is attached as the cause.
    """
    try:
        return datetime.strptime(s, "%Y-%m-%d")
    except ValueError as err:
        # Chain explicitly so the original parsing error stays visible in
        # tracebacks while argparse reports the friendlier message.
        raise argparse.ArgumentTypeError(f"Not a valid date: '{s}'.") from err
def _clear_processed_logs():
    """Delete every processed/unprocessed csv log file that currently exists."""
    for log_file in (
        PROCESSED_ID_DIFF_FILE,
        PROCESSED_NDA_DIFF_FILE,
        PROCESSED_ID_SIMILARITY_FILE,
        PROCESSED_NDA_SIMILARITY_FILE,
        UNPROCESSED_ID_DIFF_FILE,
        UNPROCESSED_ID_SIMILARITY_FILE,
        UNPROCESSED_NDA_SIMILARITY_FILE,
    ):
        # Attempt the removal directly instead of checking existence first:
        # that avoids the check-then-act race, and a missing log simply
        # means there is nothing to clear.
        try:
            os.remove(log_file)
        except FileNotFoundError:
            pass


def main():
    """Run the tasks selected on the command line, in a fixed order.

    Order of operations: optional Orange Book download, MongoDB client
    set-up, list exports, collection re-imports, optional clearing of the
    processed logs, diff and/or similarity runs, database dumps and finally
    score truncation.

    Raises:
        KeyError: if one of the ``MONGODB_*_COLLECTION_NAME`` settings is
            missing from the loaded ``.env`` configuration.
    """
    args = parse_args()
    _logger.info(f"Running with args: {args}")

    # Download the latest Orange Book file from fda.gov.
    if args.update_orange_book:
        url = "https://www.fda.gov/media/76860/download"
        file_path = fetch.download(url, ORANGE_BOOK_FOLDER)
        fetch.extract_and_clean(file_path)

    label_collection_name = _config["MONGODB_LABEL_COLLECTION_NAME"]
    labelmap_collection_name = _config["MONGODB_LABELMAP_COLLECTION_NAME"]
    patent_collection_name = _config["MONGODB_PATENT_COLLECTION_NAME"]
    orange_book_collection_name = _config["MONGODB_ORANGE_BOOK_COLLECTION_NAME"]
    mongo_client = MongoClient(
        label_collection_name,
        labelmap_collection_name,
        patent_collection_name,
        orange_book_collection_name,
    )

    # Export all patents or NDA from the Orange Book.
    if args.all_NDA_from_Orange_Book:
        export_lists.export_all_NDA(mongo_client, args.all_NDA_from_Orange_Book)
    if args.all_patents_from_Orange_Book:
        export_lists.export_all_patents(
            mongo_client, args.all_patents_from_Orange_Book
        )
    if args.all_patents_from_Orange_Book_json:
        export_lists.export_all_patents(
            mongo_client, args.all_patents_from_Orange_Book_json, True
        )

    # Export the list of patents or NDA missing from the database.
    if args.missing_NDA_from_database:
        export_lists.export_missing_NDA(
            mongo_client, args.missing_NDA_from_database
        )
    if args.missing_patents_from_database:
        export_lists.export_missing_patents(
            mongo_client, args.missing_patents_from_database
        )
    if args.missing_patents_from_database_json:
        export_lists.export_missing_patents(
            mongo_client, args.missing_patents_from_database_json, True
        )

    # Re-import of collections; for development.
    if args.reimport_labels:
        mongo_client.reimport_collection(
            label_collection_name, args.reimport_labels
        )
    if args.reimport_labelmap:
        mongo_client.reimport_collection(
            labelmap_collection_name, args.reimport_labelmap
        )
    if args.reimport_patents:
        mongo_client.reimport_collection(
            patent_collection_name, args.reimport_patents
        )
    if args.reimport_orange_book:
        mongo_client.reimport_collection(
            orange_book_collection_name, args.reimport_orange_book
        )

    # A rerun first forgets everything that was recorded as processed.
    if args.rerun:
        _clear_processed_logs()

    # Diff + similarity both run on a rerun, when the script is started
    # without any argument, or when similarity / db2csv is requested.
    run_diff_and_similarity = bool(
        args.rerun or len(sys.argv) == 1 or args.similarity or args.db2csv
    )

    # The diff step runs exactly once, whichever condition asked for it.
    if run_diff_and_similarity or args.diff or args.db2file:
        run_diff.run_diff(
            mongo_client,
            PROCESSED_ID_DIFF_FILE,
            PROCESSED_NDA_DIFF_FILE,
            UNPROCESSED_ID_DIFF_FILE,
            args.since,
        )

    if run_diff_and_similarity:
        # Imported only when needed, as in the original script
        # (presumably to avoid loading the module otherwise - TODO confirm).
        from similarity import run_similarity

        run_similarity.run_similarity(
            mongo_client,
            PROCESSED_ID_SIMILARITY_FILE,
            PROCESSED_NDA_SIMILARITY_FILE,
            UNPROCESSED_ID_SIMILARITY_FILE,
            UNPROCESSED_NDA_SIMILARITY_FILE,
            args.since,
        )

    if args.db2file:
        get_files_from_db.get_files_from_db(mongo_client, args.db2file)

    if args.db2csv:
        export_label_collection_as_csv_zip.run_export_csv_zip(
            mongo_client, args.db2csv
        )

    if args.truncate_scores:
        from similarity import truncate_score

        truncate_score.run_truncation(mongo_client)


if __name__ == "__main__":
    main()