From 01b59ed09f7008916e682a0b82de959f544a6f62 Mon Sep 17 00:00:00 2001
From: Arthur Tyukayev <github_arthur@tyukayev.com>
Date: Sat, 4 Jan 2025 22:51:55 +0400
Subject: [PATCH 1/3] Renamed parsed fields to clear up ruff errors

---
 safer/html.py    | 139 ++++++++++++++++++++++++++++-------------------
 safer/results.py |   6 +-
 2 files changed, 86 insertions(+), 59 deletions(-)

diff --git a/safer/html.py b/safer/html.py
index 8e6f552..786c020 100644
--- a/safer/html.py
+++ b/safer/html.py
@@ -4,10 +4,16 @@
 
 
 def debug_print_element(e):
-    print(html.tostring(e).decode("utf-8"))
+    print(html.tostring(e))
     print("\n\n\n")
 
 
+def get_first_item_or_none(items):
+    if len(items) == 0:
+        return None
+    return items[0]
+
+
 def process_extracted_text(extracted_data):
     if isinstance(extracted_data, list):
         if len(extracted_data) > 1:
@@ -145,18 +151,33 @@ def process_final_dictionary(data):
 
     # Formatting Mileage Year to a dictionary
     data["mcs_150_mileage_year"] = {
-        "mileage": int(data["mcs_150_mileage_year"].split(" ")[0].replace(",", ""))
-        if data["mcs_150_mileage_year"]
-        else None,
-        "year": int(
-            data["mcs_150_mileage_year"].split(" ")[1].replace("(", "").replace(")", "")
-        )
-        if data["mcs_150_mileage_year"]
-        else None,
+        "mileage": (
+            int(data["mcs_150_mileage_year"].split(" ")[0].replace(",", ""))
+            if data["mcs_150_mileage_year"]
+            else None
+        ),
+        "year": (
+            int(
+                data["mcs_150_mileage_year"]
+                .split(" ")[1]
+                .replace("(", "")
+                .replace(")", "")
+            )
+            if data["mcs_150_mileage_year"]
+            else None
+        ),
     }
 
     # HTML Returns -- for the Duns number when it should just be None or blank
     data["duns_number"] = data["duns_number"] if data["duns_number"] != "--" else None
+    data["state_carrier_id"] = (
+        data["state_carrier_id"] if data["state_carrier_id"] != "" else None
+    )
+    data["dba_name"] = data["dba_name"] if data["dba_name"] != "" else None
+
+    if data["out_of_service_date"] == "None":
+        data["out_of_service_date"] = None
+
     return data
 
 
@@ -186,7 +207,7 @@ def process_search_result_html(tree):
         # Parsing the USDOT number out of the xpath
         c_id_parsed = parse_qsl(c_id[10:])
         # Getting the Raw HTML.
-        c_raw_html = html.tostring(item, pretty_print=True).decode("utf-8")
+        c_raw_html = html.tostring(item, pretty_print=True)
 
         result_set.append(
             {
@@ -226,97 +247,103 @@ def process_company_snapshot(tree):
     crashes_tables = tree.xpath('//table[@summary="Crashes"]')
 
     fields = {
-        "entity_type": "tr[2]/td/text()",
-        "legal_name": "tr[4]/td/text()",
-        "dba_name": "tr[5]/td/text()",
-        "physical_address": "tr[6]/td/text()",
-        "phone": "tr[7]/td/text()",
-        "mailing_address": "tr[8]/td/text()",
-        "usdot": "tr[9]/td[1]/text()",
-        "state_carrier_id": "tr[9]/td[2]/text()",
-        "mc_mx_ff_numbers": "tr[10]/td[1]/a/text()",
-        "duns_number": "tr[10]/td[2]/text()",
-        "power_units": "tr[11]/td[1]/text()",
-        "drivers": "tr[11]/td[2]/font/b/text()",
-        "mcs_150_form_date": "tr[12]/td[1]/text()",
-        "mcs_150_mileage_year": "tr[12]/td[2]/font/b/text()",
+        "entity_type": "tr[3]/td/text()",
+        "usdot_status": "tr[4]/td[1]/text()",
+        "legal_name": "tr[11]/td/text()",
+        "dba_name": "tr[12]/td/text()",
+        "physical_address": "tr[13]/td/text()",
+        "phone": "tr[14]/td/text()",
+        "mailing_address": "tr[15]/td/text()",
+        "usdot": "tr[5]/td[1]/text()",
+        "state_carrier_id": "tr[5]/td[2]/text()",
+        "mc_mx_ff_numbers": "tr[9]/td[1]/a/text()",
+        "duns_number": "tr[16]/td/text()",
+        "power_units": "tr[17]/td[1]/text()",
+        "drivers": "tr[17]/td[2]/font/b/text()",
+        "mcs_150_form_date": "tr[6]/td[1]/text()",
+        "mcs_150_mileage_year": "tr[6]/td[2]/font/b/text()",
     }
 
+    parsed_fields = {}
+
     for item in fields:
-        fields[item] = process_extracted_text(general_info_table.xpath(fields[item]))
+        parsed_fields[item] = process_extracted_text(
+            general_info_table.xpath(fields[item])
+        )
 
     # Out of Service Date comes in as a string 'None' if None
-    fields["out_of_service_date"] = process_extracted_text(
-        general_info_table.xpath("tr[3]/td[2]/text()")
+    parsed_fields["out_of_service_date"] = process_extracted_text(
+        general_info_table.xpath("tr[4]/td[2]/text()")
     )
-    if fields["out_of_service_date"] == "None":
-        fields["out_of_service_date"] = None
 
     # Getting Operating Status out of HTML, must be done outside of loop because it requires more decisiveness
     if not isinstance(
-        process_extracted_text(general_info_table.xpath("tr[3]/td[1]/text()")), str
+        process_extracted_text(general_info_table.xpath("tr[8]/td")),
+        list,
     ):
-        fields["operating_status"] = process_extracted_text(
-            general_info_table.xpath("tr[3]/td[1]/font/b/text()")
+        operating_status_data_list = general_info_table.xpath(
+            "tr[8]/td[1]/font/b/text()"
         )
+        operating_authority_status = get_first_item_or_none(operating_status_data_list)
+        parsed_fields["operating_authority_status"] = operating_authority_status
     else:
-        fields["operating_status"] = process_extracted_text(
-            general_info_table.xpath("tr[3]/td[1]/text()")
-        )
+        parsed_fields["operating_authority_status"] = None
 
     # Getting Operation Classifications from a list of classifications if the table exists in the HTML
     # Checks the HTML for all table rows that contain and X next to them
     if len(operation_classification_table) == 1:
-        fields["operation_classification"] = []
+        parsed_fields["operation_classification"] = []
         operation_classification_table = operation_classification_table.pop(0)
         for classification in operation_classification_table.xpath(
             "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()"
         ):
-            fields["operation_classification"].append(classification)
+            parsed_fields["operation_classification"].append(classification)
         last_val = operation_classification_table.xpath(
             "tr[2]/td[3]/table/tr[5]/td[2]/text()"
         )
         if len(last_val) > 0:
-            fields["operation_classification"].append(process_extracted_text(last_val))
+            parsed_fields["operation_classification"].append(
+                process_extracted_text(last_val)
+            )
 
     # Parsing out Carrier Operation from the list of types
     # Checks the HTML for all table rows that contain and X next to them
     if len(carrier_operation_table) == 1:
-        fields["carrier_operation"] = []
+        parsed_fields["carrier_operation"] = []
         carrier_operation_table = carrier_operation_table.pop(0)
         for operation in carrier_operation_table.xpath(
             "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()"
         ):
-            fields["carrier_operation"].append(operation)
+            parsed_fields["carrier_operation"].append(operation)
 
     # Parsing out Shipper Opertation from the list of types if the table exists in the HTML
     # Checks the HTML for all table rows that contain and X next to thema
     if len(hm_shipper_operation_table) == 1:
-        fields["hm_shipper_operation"] = []
+        parsed_fields["hm_shipper_operation"] = []
         hm_shipper_operation_table = hm_shipper_operation_table.pop(0)
         for operation in hm_shipper_operation_table.xpath(
             "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()"
         ):
-            fields["hm_shipper_operation"].append(operation)
+            parsed_fields["hm_shipper_operation"].append(operation)
     else:
-        fields["hm_shipper_operation"] = None
+        parsed_fields["hm_shipper_operation"] = None
 
     # Parsing out the type of cargo this carrier is authorized or carry if the table exists in the HTML
     # Checks the HTML for all table rows that contain and X next to them
     if len(cargo_carried_table) == 1:
-        fields["cargo_carried"] = []
+        parsed_fields["cargo_carried"] = []
         cargo_carried_table = cargo_carried_table.pop(0)
         for cargo in cargo_carried_table.xpath(
             "tr[2]/td/table/tr[.//td[@class='queryfield']/text() = 'X']/td/font/text()"
         ):
-            fields["cargo_carried"].append(cargo)
+            parsed_fields["cargo_carried"].append(cargo)
 
     # Parsing the data from tables into nested dictionaries.
 
     # Parsing the inspections in the United Status if the tables exist in the HTML
     if len(inspections_tables) == 2:
         us_inspections_table = inspections_tables[0]
-        fields["united_states_inspections"] = {
+        parsed_fields["united_states_inspections"] = {
             "vehicle": {
                 "inspections": process_extracted_text(
                     us_inspections_table.xpath("tr[2]/td[1]/text()")
@@ -378,7 +405,7 @@ def process_company_snapshot(tree):
     # Parsing the crashes in the United States if the tables exist in the HTML
     if len(crashes_tables) == 2:
         us_crashes_table = crashes_tables[0]
-        fields["united_states_crashes"] = {
+        parsed_fields["united_states_crashes"] = {
             "fatal": process_extracted_text(
                 us_crashes_table.xpath("tr[2]/td[1]/text()")
             ),
@@ -394,7 +421,7 @@ def process_company_snapshot(tree):
     # Parsing the inspections in Canada if the tables exist in the HTML
     if len(inspections_tables) == 2:
         canada_inspections_table = inspections_tables[1]
-        fields["canada_inspections"] = {
+        parsed_fields["canada_inspections"] = {
             "vehicle": {
                 "inspections": process_extracted_text(
                     canada_inspections_table.xpath("tr[2]/td[1]/text()")
@@ -422,7 +449,7 @@ def process_company_snapshot(tree):
     # Parsing the crashes in Canada if the tables exist in the HTML
     if len(crashes_tables) == 2:
         canada_crashes_table = crashes_tables[1]
-        fields["canada_crashes"] = {
+        parsed_fields["canada_crashes"] = {
             "fatal": process_extracted_text(
                 canada_crashes_table.xpath("tr[2]/td[1]/text()")
             ),
@@ -440,23 +467,23 @@ def process_company_snapshot(tree):
     # Parsing the Safety Rating if it exists in the HTML
     if len(safety_rating_table) == 1:
         safety_rating_table = safety_rating_table.pop(0)
-        fields["safety_rating_date"] = process_extracted_text(
+        parsed_fields["safety_rating_date"] = process_extracted_text(
             safety_rating_table.xpath("tr[2]/td[1]/text()")
         )
-        fields["safety_review_date"] = process_extracted_text(
+        parsed_fields["safety_review_date"] = process_extracted_text(
             safety_rating_table.xpath("tr[2]/td[2]/text()")
         )
-        fields["safety_rating"] = process_extracted_text(
+        parsed_fields["safety_rating"] = process_extracted_text(
             safety_rating_table.xpath("tr[3]/td[1]/text()")
         )
-        fields["safety_type"] = process_extracted_text(
+        parsed_fields["safety_type"] = process_extracted_text(
             safety_rating_table.xpath("tr[3]/td[2]/text()")
         )
 
     # Parsing the latest update date.
-    fields["latest_update"] = process_extracted_text(
+    parsed_fields["latest_update"] = process_extracted_text(
         tree.xpath("//b/font[@color='#0000C0']/text()")[-1]
     )
 
-    fields = process_final_dictionary(fields)
-    return fields
+    parsed_fields = process_final_dictionary(parsed_fields)
+    return parsed_fields
diff --git a/safer/results.py b/safer/results.py
index 358efbf..785467a 100644
--- a/safer/results.py
+++ b/safer/results.py
@@ -22,7 +22,7 @@ def __init__(self, data):
 
         # Mapping values.
         self.__entity_type = data["entity_type"]
-        self.__operating_status = data["operating_status"]
+        self.__operating_authority_status = data["operating_authority_status"]
         self.__legal_name = data["legal_name"]
         self.__dba_name = data["dba_name"]
         self.__duns_number = data["duns_number"]
@@ -84,8 +84,8 @@ def __init__(self, data):
         self.__raw["url"] = self.__url
 
     @property
-    def operating_status(self):
-        return self.__operating_status
+    def operating_authority_status_status(self):
+        return self.__operating_authority_status
 
     @property
     def safety_review_data(self):

From 2d82229a136564eef994fb9ef14ba57dc1537b07 Mon Sep 17 00:00:00 2001
From: Arthur Tyukayev <github_arthur@tyukayev.com>
Date: Sat, 4 Jan 2025 23:34:34 +0400
Subject: [PATCH 2/3] Updated parsing of operating status for both wrapped and
 non-wrapped

---
 safer/html.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/safer/html.py b/safer/html.py
index 786c020..ce07571 100644
--- a/safer/html.py
+++ b/safer/html.py
@@ -168,6 +168,11 @@ def process_final_dictionary(data):
         ),
     }
 
+    data["safety_rating_date"] = None
+    data["safety_review_date"] = None
+    data["safety_rating"] = None
+    data["safety_type"] = None
+
     # HTML Returns -- for the Duns number when it should just be None or blank
     data["duns_number"] = data["duns_number"] if data["duns_number"] != "--" else None
     data["state_carrier_id"] = (
@@ -277,15 +282,18 @@ def process_company_snapshot(tree):
     )
 
     # Getting Operating Status out of HTML, must be done outside of loop because it requires more decisiveness
-    if not isinstance(
-        process_extracted_text(general_info_table.xpath("tr[8]/td")),
-        list,
-    ):
-        operating_status_data_list = general_info_table.xpath(
-            "tr[8]/td[1]/font/b/text()"
+    operating_status_data_items_length = len(general_info_table.xpath("tr[8]/td"))
+    if operating_status_data_items_length > 0:
+        operating_status_font_wrapped_based = process_extracted_text(
+            general_info_table.xpath("tr[8]/td[1]/font/b/text()")
+        )
+        operating_status_non_wrapped = process_extracted_text(
+            general_info_table.xpath("tr[8]/td[1]/text()")
+        )
+
+        parsed_fields["operating_authority_status"] = (
+            operating_status_font_wrapped_based or operating_status_non_wrapped
         )
-        operating_authority_status = get_first_item_or_none(operating_status_data_list)
-        parsed_fields["operating_authority_status"] = operating_authority_status
     else:
         parsed_fields["operating_authority_status"] = None
 

From 8cd3a03761d88c20798d97c72fa4ce4fb31571a5 Mon Sep 17 00:00:00 2001
From: Arthur Tyukayev <github_arthur@tyukayev.com>
Date: Sat, 4 Jan 2025 23:48:22 +0400
Subject: [PATCH 3/3] Fixed pylint recommendations

---
 safer/html.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/safer/html.py b/safer/html.py
index ce07571..0c10d4a 100644
--- a/safer/html.py
+++ b/safer/html.py
@@ -271,9 +271,9 @@ def process_company_snapshot(tree):
 
     parsed_fields = {}
 
-    for item in fields:
-        parsed_fields[item] = process_extracted_text(
-            general_info_table.xpath(fields[item])
+    for item in fields.items():
+        parsed_fields[item[0]] = process_extracted_text(
+            general_info_table.xpath(item[1])
         )
 
     # Out of Service Date comes in as a string 'None' if None