From 01b59ed09f7008916e682a0b82de959f544a6f62 Mon Sep 17 00:00:00 2001 From: Arthur Tyukayev Date: Sat, 4 Jan 2025 22:51:55 +0400 Subject: [PATCH 1/3] Renamed parsed fields to clear up ruff errors --- safer/html.py | 139 ++++++++++++++++++++++++++++------------------- safer/results.py | 6 +- 2 files changed, 86 insertions(+), 59 deletions(-) diff --git a/safer/html.py b/safer/html.py index 8e6f552..786c020 100644 --- a/safer/html.py +++ b/safer/html.py @@ -4,10 +4,16 @@ def debug_print_element(e): - print(html.tostring(e).decode("utf-8")) + print(html.tostring(e)) print("\n\n\n") +def get_first_item_or_none(items): + if len(items) == 0: + return None + return items[0] + + def process_extracted_text(extracted_data): if isinstance(extracted_data, list): if len(extracted_data) > 1: @@ -145,18 +151,33 @@ def process_final_dictionary(data): # Formatting Mileage Year to a dictionary data["mcs_150_mileage_year"] = { - "mileage": int(data["mcs_150_mileage_year"].split(" ")[0].replace(",", "")) - if data["mcs_150_mileage_year"] - else None, - "year": int( - data["mcs_150_mileage_year"].split(" ")[1].replace("(", "").replace(")", "") - ) - if data["mcs_150_mileage_year"] - else None, + "mileage": ( + int(data["mcs_150_mileage_year"].split(" ")[0].replace(",", "")) + if data["mcs_150_mileage_year"] + else None + ), + "year": ( + int( + data["mcs_150_mileage_year"] + .split(" ")[1] + .replace("(", "") + .replace(")", "") + ) + if data["mcs_150_mileage_year"] + else None + ), } # HTML Returns -- for the Duns number when it should just be None or blank data["duns_number"] = data["duns_number"] if data["duns_number"] != "--" else None + data["state_carrier_id"] = ( + data["state_carrier_id"] if data["state_carrier_id"] != "" else None + ) + data["dba_name"] = data["dba_name"] if data["dba_name"] != "" else None + + if data["out_of_service_date"] == "None": + data["out_of_service_date"] = None + return data @@ -186,7 +207,7 @@ def process_search_result_html(tree): # Parsing the USDOT number out of the xpath c_id_parsed = parse_qsl(c_id[10:]) # Getting the Raw HTML. - c_raw_html = html.tostring(item, pretty_print=True).decode("utf-8") + c_raw_html = html.tostring(item, pretty_print=True) result_set.append( { @@ -226,97 +247,103 @@ def process_company_snapshot(tree): crashes_tables = tree.xpath('//table[@summary="Crashes"]') fields = { - "entity_type": "tr[2]/td/text()", - "legal_name": "tr[4]/td/text()", - "dba_name": "tr[5]/td/text()", - "physical_address": "tr[6]/td/text()", - "phone": "tr[7]/td/text()", - "mailing_address": "tr[8]/td/text()", - "usdot": "tr[9]/td[1]/text()", - "state_carrier_id": "tr[9]/td[2]/text()", - "mc_mx_ff_numbers": "tr[10]/td[1]/a/text()", - "duns_number": "tr[10]/td[2]/text()", - "power_units": "tr[11]/td[1]/text()", - "drivers": "tr[11]/td[2]/font/b/text()", - "mcs_150_form_date": "tr[12]/td[1]/text()", - "mcs_150_mileage_year": "tr[12]/td[2]/font/b/text()", + "entity_type": "tr[3]/td/text()", + "usdot_status": "tr[4]/td[1]/text()", + "legal_name": "tr[11]/td/text()", + "dba_name": "tr[12]/td/text()", + "physical_address": "tr[13]/td/text()", + "phone": "tr[14]/td/text()", + "mailing_address": "tr[15]/td/text()", + "usdot": "tr[5]/td[1]/text()", + "state_carrier_id": "tr[5]/td[2]/text()", + "mc_mx_ff_numbers": "tr[9]/td[1]/a/text()", + "duns_number": "tr[16]/td/text()", + "power_units": "tr[17]/td[1]/text()", + "drivers": "tr[17]/td[2]/font/b/text()", + "mcs_150_form_date": "tr[6]/td[1]/text()", + "mcs_150_mileage_year": "tr[6]/td[2]/font/b/text()", } + parsed_fields = {} + for item in fields: - fields[item] = process_extracted_text(general_info_table.xpath(fields[item])) + parsed_fields[item] = process_extracted_text( + general_info_table.xpath(fields[item]) + ) # Out of Service Date comes in as a string 'None' if None - fields["out_of_service_date"] = process_extracted_text( - general_info_table.xpath("tr[3]/td[2]/text()") + parsed_fields["out_of_service_date"] = process_extracted_text( + general_info_table.xpath("tr[4]/td[2]/text()") ) - if fields["out_of_service_date"] == "None": - fields["out_of_service_date"] = None # Getting Operating Status out of HTML, must be done outside of loop because it requires more decisiveness if not isinstance( - process_extracted_text(general_info_table.xpath("tr[3]/td[1]/text()")), str + process_extracted_text(general_info_table.xpath("tr[8]/td")), + list, ): - fields["operating_status"] = process_extracted_text( - general_info_table.xpath("tr[3]/td[1]/font/b/text()") + operating_status_data_list = general_info_table.xpath( + "tr[8]/td[1]/font/b/text()" ) + operating_authority_status = get_first_item_or_none(operating_status_data_list) + parsed_fields["operating_authority_status"] = operating_authority_status else: - fields["operating_status"] = process_extracted_text( - general_info_table.xpath("tr[3]/td[1]/text()") - ) + parsed_fields["operating_authority_status"] = None # Getting Operation Classifications from a list of classifications if the table exists in the HTML # Checks the HTML for all table rows that contain and X next to them if len(operation_classification_table) == 1: - fields["operation_classification"] = [] + parsed_fields["operation_classification"] = [] operation_classification_table = operation_classification_table.pop(0) for classification in operation_classification_table.xpath( "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()" ): - fields["operation_classification"].append(classification) + parsed_fields["operation_classification"].append(classification) last_val = operation_classification_table.xpath( "tr[2]/td[3]/table/tr[5]/td[2]/text()" ) if len(last_val) > 0: - fields["operation_classification"].append(process_extracted_text(last_val)) + parsed_fields["operation_classification"].append( + process_extracted_text(last_val) + ) # Parsing out Carrier Operation from the list of types # Checks the HTML for all table rows that contain and X next to them if len(carrier_operation_table) == 1: - fields["carrier_operation"] = [] + parsed_fields["carrier_operation"] = [] carrier_operation_table = carrier_operation_table.pop(0) for operation in carrier_operation_table.xpath( "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()" ): - fields["carrier_operation"].append(operation) + parsed_fields["carrier_operation"].append(operation) # Parsing out Shipper Opertation from the list of types if the table exists in the HTML # Checks the HTML for all table rows that contain and X next to thema if len(hm_shipper_operation_table) == 1: - fields["hm_shipper_operation"] = [] + parsed_fields["hm_shipper_operation"] = [] hm_shipper_operation_table = hm_shipper_operation_table.pop(0) for operation in hm_shipper_operation_table.xpath( "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()" ): - fields["hm_shipper_operation"].append(operation) + parsed_fields["hm_shipper_operation"].append(operation) else: - fields["hm_shipper_operation"] = None + parsed_fields["hm_shipper_operation"] = None # Parsing out the type of cargo this carrier is authorized or carry if the table exists in the HTML # Checks the HTML for all table rows that contain and X next to them if len(cargo_carried_table) == 1: - fields["cargo_carried"] = [] + parsed_fields["cargo_carried"] = [] cargo_carried_table = cargo_carried_table.pop(0) for cargo in cargo_carried_table.xpath( "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()" ): - fields["cargo_carried"].append(cargo) + parsed_fields["cargo_carried"].append(cargo) # Parsing the data from tables into nested dictionaries. # Parsing the inspections in the United Status if the tables exist in the HTML if len(inspections_tables) == 2: us_inspections_table = inspections_tables[0] - fields["united_states_inspections"] = { + parsed_fields["united_states_inspections"] = { "vehicle": { "inspections": process_extracted_text( us_inspections_table.xpath("tr[2]/td[1]/text()") @@ -378,7 +405,7 @@ def process_company_snapshot(tree): # Parsing the crashes in the United States if the tables exist in the HTML if len(crashes_tables) == 2: us_crashes_table = crashes_tables[0] - fields["united_states_crashes"] = { + parsed_fields["united_states_crashes"] = { "fatal": process_extracted_text( us_crashes_table.xpath("tr[2]/td[1]/text()") ), @@ -394,7 +421,7 @@ def process_company_snapshot(tree): # Parsing the inspections in Canada if the tables exist in the HTML if len(inspections_tables) == 2: canada_inspections_table = inspections_tables[1] - fields["canada_inspections"] = { + parsed_fields["canada_inspections"] = { "vehicle": { "inspections": process_extracted_text( canada_inspections_table.xpath("tr[2]/td[1]/text()") @@ -422,7 +449,7 @@ def process_company_snapshot(tree): # Parsing the crashes in Canada if the tables exist in the HTML if len(crashes_tables) == 2: canada_crashes_table = crashes_tables[1] - fields["canada_crashes"] = { + parsed_fields["canada_crashes"] = { "fatal": process_extracted_text( canada_crashes_table.xpath("tr[2]/td[1]/text()") ), @@ -440,23 +467,23 @@ def process_company_snapshot(tree): # Parsing the Safety Rating if it exists in the HTML if len(safety_rating_table) == 1: safety_rating_table = safety_rating_table.pop(0) - fields["safety_rating_date"] = process_extracted_text( + parsed_fields["safety_rating_date"] = process_extracted_text( safety_rating_table.xpath("tr[2]/td[1]/text()") ) - fields["safety_review_date"] = process_extracted_text( + parsed_fields["safety_review_date"] = process_extracted_text( safety_rating_table.xpath("tr[2]/td[2]/text()") ) - fields["safety_rating"] = process_extracted_text( + parsed_fields["safety_rating"] = process_extracted_text( safety_rating_table.xpath("tr[3]/td[1]/text()") ) - fields["safety_type"] = process_extracted_text( + parsed_fields["safety_type"] = process_extracted_text( safety_rating_table.xpath("tr[3]/td[2]/text()") ) # Parsing the latest update date. - fields["latest_update"] = process_extracted_text( + parsed_fields["latest_update"] = process_extracted_text( tree.xpath("//b/font[@color='#0000C0']/text()")[-1] ) - fields = process_final_dictionary(fields) - return fields + parsed_fields = process_final_dictionary(parsed_fields) + return parsed_fields diff --git a/safer/results.py b/safer/results.py index 358efbf..785467a 100644 --- a/safer/results.py +++ b/safer/results.py @@ -22,7 +22,7 @@ def __init__(self, data): # Mapping values. self.__entity_type = data["entity_type"] - self.__operating_status = data["operating_status"] + self.__operating_authority_status = data["operating_authority_status"] self.__legal_name = data["legal_name"] self.__dba_name = data["dba_name"] self.__duns_number = data["duns_number"] @@ -84,8 +84,8 @@ def __init__(self, data): self.__raw["url"] = self.__url @property - def operating_status(self): - return self.__operating_status + def operating_authority_status_status(self): + return self.__operating_authority_status @property def safety_review_data(self): From 2d82229a136564eef994fb9ef14ba57dc1537b07 Mon Sep 17 00:00:00 2001 From: Arthur Tyukayev Date: Sat, 4 Jan 2025 23:34:34 +0400 Subject: [PATCH 2/3] Updated parsing of operating status for both wrapped and non-wrapped --- safer/html.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/safer/html.py b/safer/html.py index 786c020..ce07571 100644 --- a/safer/html.py +++ b/safer/html.py @@ -168,6 +168,11 @@ def process_final_dictionary(data): ), } + data["safety_rating_date"] = None + data["safety_review_date"] = None + data["safety_rating"] = None + data["safety_type"] = None + # HTML Returns -- for the Duns number when it should just be None or blank data["duns_number"] = data["duns_number"] if data["duns_number"] != "--" else None data["state_carrier_id"] = ( @@ -277,15 +282,18 @@ def process_company_snapshot(tree): ) # Getting Operating Status out of HTML, must be done outside of loop because it requires more decisiveness - if not isinstance( - process_extracted_text(general_info_table.xpath("tr[8]/td")), - list, - ): - operating_status_data_list = general_info_table.xpath( - "tr[8]/td[1]/font/b/text()" + operating_status_data_items_length = len(general_info_table.xpath("tr[8]/td")) + if operating_status_data_items_length > 0: + operating_status_font_wrapped_based = process_extracted_text( + general_info_table.xpath("tr[8]/td[1]/font/b/text()") + ) + operating_status_non_wrapped = process_extracted_text( + general_info_table.xpath("tr[8]/td[1]/text()") + ) + + parsed_fields["operating_authority_status"] = ( + operating_status_font_wrapped_based or operating_status_non_wrapped ) - operating_authority_status = get_first_item_or_none(operating_status_data_list) - parsed_fields["operating_authority_status"] = operating_authority_status else: parsed_fields["operating_authority_status"] = None From 8cd3a03761d88c20798d97c72fa4ce4fb31571a5 Mon Sep 17 00:00:00 2001 From: Arthur Tyukayev Date: Sat, 4 Jan 2025 23:48:22 +0400 Subject: [PATCH 3/3] Fixed pylint recommendations --- safer/html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/safer/html.py b/safer/html.py index ce07571..0c10d4a 100644 --- a/safer/html.py +++ b/safer/html.py @@ -271,9 +271,9 @@ def process_company_snapshot(tree): parsed_fields = {} - for item in fields: - parsed_fields[item] = process_extracted_text( - general_info_table.xpath(fields[item]) + for item in fields.items(): + parsed_fields[item[0]] = process_extracted_text( + general_info_table.xpath(item[1]) ) # Out of Service Date comes in as a string 'None' if None