Usefull-py-scripts/pdfcoordpoints.py at main · davcaro99/Usefull-py-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""
This script processes a PDF list of georeferenced localities,
extracting details such as coordinates, names, measurements, and comments into an Excel (.xlsx) file.
The resulting data can then be edited and used to generate a KMZ file with all information, including images.
Adjust the field labels and regular expressions in this script to match the layout of your PDF file.
"""

import pdfplumber
import pandas as pd
import re

def _first_group(pattern, text, flags=0):
    """Return the stripped first capture group of *pattern* in *text*, or None if there is no match."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else None


def extract_pdf_data(pdf_path):
    """
    Extract data from PDF file and return as a list of dictionaries.

    Each page's text is split on the marker 'Locality No.'; every section that
    starts with a number becomes one record with the keys 'Locality_No',
    'Latitude', 'Longitude', 'Unit', 'Potential', 'Lithology', 'Comments' and
    'Contact_Information'. A field whose pattern is not found is left as None.

    Note: pages are parsed independently, so a locality whose text continues
    on the next page only contributes the part found on its first page.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A non-empty list of dictionaries, one per locality found.

    Raises:
        ValueError: If no locality could be extracted from any page.
    """
    # Latitude (… N) followed by longitude (… W), written with typographic
    # apostrophes (’ for minutes, ’’ for seconds) as they appear in the PDF text.
    coords_pattern = r'(\d+°\s*\d+’\s*\d+(?:[.,]\d+)?’’\s*N)\s+(\d+°\s*\d+’\s*\d+(?:[.,]\d+)?’’\s*W)'

    extracted_data = []

    with pdfplumber.open(pdf_path) as pdf:
        print(f"Processing PDF with {len(pdf.pages)} pages...")

        for page_num, page in enumerate(pdf.pages, 1):
            # extract_text() may yield None for a page without a text layer
            # (e.g. a scanned image); treat that as an empty page instead of
            # crashing on .split().
            text = page.extract_text() or ""
            print(f"\nProcessing page {page_num}")

            # Split text into sections for each locality
            localities = text.split('Locality No.')

            for loc in localities[1:]:  # Skip the text before the first marker
                try:
                    # The locality number must open the section
                    locality_match = re.search(r'^\s*(\d+)', loc)
                    if not locality_match:
                        print("Could not find Locality Number, skipping...")
                        continue

                    locality_no = locality_match.group(1)
                    print(f"\nProcessing Locality No. {locality_no}")

                    latitude = None
                    longitude = None
                    coords_match = re.search(coords_pattern, loc)
                    if coords_match:
                        latitude, longitude = coords_match.groups()

                    # Each field is the text between its label and the next label
                    unit = _first_group(r'Unit\s+(.*?)\s+Potential', loc)
                    potential = _first_group(r'Potential\s+(.*?)\s+Lithology', loc)
                    lithology = _first_group(r'Lithology\s+(.*?)\s+Comments', loc, re.DOTALL)

                    # \s* (not \s+) before the terminator: the section usually
                    # ends without trailing whitespace, and requiring one
                    # dropped the comments of the last locality on a page.
                    comments = _first_group(
                        r'Comments\s+(.*?)\s*(?:Contact information|$)', loc, re.DOTALL
                    )

                    # Contact information runs to the end of the section
                    contact_info = _first_group(
                        r'Contact information\s+(.*?)(?=$)', loc, re.DOTALL
                    )

                    data = {
                        'Locality_No': locality_no,
                        'Latitude': latitude,
                        'Longitude': longitude,
                        'Unit': unit,
                        'Potential': potential,
                        'Lithology': lithology,
                        'Comments': comments,
                        'Contact_Information': contact_info
                    }

                    # Print extracted data for debugging
                    print(f"Extracted data for Locality {locality_no}:")
                    print(f"Coordinates: {latitude} / {longitude}")
                    print(f"Unit: {unit}")
                    print(f"Potential: {potential}")

                    extracted_data.append(data)

                except Exception as e:
                    # Best effort: one malformed section must not abort the run
                    print(f"Error processing locality: {str(e)}")
                    continue

    if not extracted_data:
        raise ValueError("No data could be extracted from the PDF. Please check the format of your PDF file.")

    return extracted_data

def create_excel(data, output_path):
    """
    Write the extracted locality records to an Excel workbook.

    The records are loaded into a DataFrame, arranged in a fixed column order
    (missing columns are added empty), 'Locality_No' is converted to numbers,
    and the result is saved to a single sheet named 'Geological Data' with
    preset column widths.

    Args:
        data: Records accepted by ``pd.DataFrame`` (the script passes a list
            of dictionaries).
        output_path: Destination of the workbook, written with the openpyxl
            engine.
    """
    sheet_title = 'Geological Data'

    ordered_columns = [
        'Locality_No',
        'Latitude',
        'Longitude',
        'Unit',
        'Potential',
        'Lithology',
        'Comments',
        'Contact_Information',
    ]

    # Width per spreadsheet column letter.
    # NOTE(review): ten letters are configured although only eight columns are
    # written; the widths may stem from an earlier layout - confirm.
    column_widths = {
        'A': 12,
        'B': 12,
        'C': 12,
        'D': 20,
        'E': 20,
        'F': 25,
        'G': 15,
        'H': 50,
        'I': 50,
        'J': 20,
    }

    frame = pd.DataFrame(data)

    # Add any expected column the records did not provide, then fix the order
    absent = [name for name in ordered_columns if name not in frame.columns]
    for name in absent:
        frame[name] = None
    frame = frame[ordered_columns]

    # Store locality numbers as numbers; unparsable values become NaN
    try:
        frame['Locality_No'] = pd.to_numeric(frame['Locality_No'], errors='coerce')
    except Exception as exc:
        print(f"Warning: Error formatting numeric columns: {str(exc)}")

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        frame.to_excel(writer, index=False, sheet_name=sheet_title)
        sheet = writer.sheets[sheet_title]

        for letter, width in column_widths.items():
            sheet.column_dimensions[letter].width = width

    print(f"\nCreated Excel file with {len(frame)} records")
    print("\nColumns in the Excel file:")
    print(frame.columns.tolist())

def main(pdf_path='example.pdf', excel_path='output1.xlsx'):
    """
    Run the full conversion: extract localities from a PDF and save them to Excel.

    Errors are reported on standard output instead of being raised.

    Args:
        pdf_path: Path of the input PDF. Defaults to 'example.pdf'.
        excel_path: Path of the Excel file to create. Defaults to 'output1.xlsx'.
    """
    try:
        print(f"Starting extraction from: {pdf_path}")
        try:
            data = extract_pdf_data(pdf_path)
        except FileNotFoundError:
            # Scoped to the extraction step so that a missing output directory
            # is not misreported as a missing PDF.
            print(f"Error: Could not find the PDF file at {pdf_path}")
            return

        print(f"\nCreating Excel file at: {excel_path}")
        create_excel(data, excel_path)

        print("\nProcessing complete!")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("\nPlease ensure:")
        print("1. The PDF file exists and is readable")
        print("2. The PDF contains text (not scanned images)")
        print("3. The output directory is writable")

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()