-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProgram.cs
More file actions
198 lines (173 loc) · 7.61 KB
/
Program.cs
File metadata and controls
198 lines (173 loc) · 7.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
using GroupDocs.Parser.Data;
using GroupDocs.Parser.Options;
using GroupDocs.Parser.Templates;
namespace GroupDocs.Parser.PdfTablesExtraction
{
internal class Program
{
/// <summary>
/// Entry point: runs the three table-extraction demos one after another.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Order matters only for the console output: single page, whole document, template.
    Action[] demos =
    {
        ExtractTablesPerParticluarPage,
        ExtractAllTablesFromDocument,
        ExtractTablesWithTemplate,
    };

    foreach (Action demo in demos)
    {
        demo();
    }
}
/// <summary>
/// Extracts and displays the tables found on one particular page (the first one) of a PDF document.
/// </summary>
/// <remarks>
/// Opens <c>Invoices.pdf</c>, prints the total page count, then requests the tables of
/// page index 0 only and prints each table's size followed by its cell contents.
/// Nothing beyond the page count is printed when the document has no such page
/// or when no tables are returned.
/// </remarks>
static void ExtractTablesPerParticluarPage()
{
    string sample = "Invoices.pdf";
    Console.WriteLine();
    Console.WriteLine($"Extracting tables from: {sample}");
    using (var parser = new Parser(sample))
    {
        var documentInfo = parser.GetDocumentInfo();
        int pageCount = documentInfo.PageCount;
        Console.WriteLine($"Total pages: {pageCount}");
        Console.WriteLine();

        // Extract tables from the first page only.
        var pageIndex = 0;
        if (pageIndex >= pageCount)
        {
            // The requested page does not exist (e.g. a document without pages),
            // so do not ask the parser for it.
            return;
        }

        var tables = parser.GetTables(pageIndex);
        if (tables == null)
        {
            // No table collection was returned; nothing to display.
            return;
        }

        // Enumerate the result exactly once: an empty sequence simply yields no
        // iterations, so a separate Any() pass (a second enumeration) is not needed.
        int tableNumber = 1;
        foreach (var table in tables)
        {
            Console.WriteLine($" Table {tableNumber}: {table.RowCount} rows x {table.ColumnCount} columns");
            ProcessTable(table);
            tableNumber++;
        }
    }
}
/// <summary>
/// Extracts every table from the PDF document in a single call and displays them grouped by page.
/// </summary>
/// <remarks>
/// Opens <c>TablesReport.pdf</c>, groups the returned tables by their page index
/// (ascending), and for each page prints a heading followed by each table's size
/// and cell contents. Page numbers in the output are one-based.
/// </remarks>
static void ExtractAllTablesFromDocument()
{
    string sample = "TablesReport.pdf";
    Console.WriteLine();
    Console.WriteLine($"Extracting tables from: {sample}");
    using (var parser = new Parser(sample))
    {
        var tables = parser.GetTables();
        if (tables == null)
        {
            // No table collection was returned; nothing to display.
            return;
        }

        // Group tables by page index. The source sequence is enumerated only once
        // (by GroupBy); an empty source produces no groups, so no Any() pre-check
        // is required.
        var tablesByPage = tables
            .GroupBy(table => table.Page.Index)
            .OrderBy(group => group.Key);

        foreach (var pageGroup in tablesByPage)
        {
            int pageIndex = pageGroup.Key;
            Console.WriteLine($"Tables in the Page {pageIndex + 1}");
            Console.WriteLine();

            int tableNumber = 1;
            foreach (var table in pageGroup)
            {
                Console.WriteLine($" Table {tableNumber}: {table.RowCount} rows x {table.ColumnCount} columns");
                ProcessTable(table);
                tableNumber++;
            }
            Console.WriteLine();
        }
    }
}
/// <summary>
/// Extracts tables from a PDF document using a template definition.
/// </summary>
/// <remarks>
/// Loads a template from <c>Scanned-tables.layout.xml</c>, parses
/// <c>Scanned-tables.pdf</c> with it (OCR enabled), keeps only the fields whose
/// page area is a <see cref="PageTableArea"/>, and prints them grouped by page
/// with each table's field name, size and cell contents.
/// If parsing yields no data object, a short notice is printed instead.
/// </remarks>
static void ExtractTablesWithTemplate()
{
    string sample = "Scanned-tables.pdf";
    string templateFile = "Scanned-tables.layout.xml";
    Console.WriteLine();
    Console.WriteLine($"Extracting tables from: {sample}");
    Console.WriteLine($"Using template: {templateFile}");
    Console.WriteLine();
    using (var parser = new Parser(sample))
    {
        // Load template from XML file
        Template template = Template.Load(templateFile);

        // Parse document using template.
        // NOTE(review): 0 is presumably the page index and 288 the preview DPI used
        // for OCR - confirm against the GroupDocs.Parser API reference.
        ParseByTemplateOptions options = new ParseByTemplateOptions(0, true, new OcrOptions(new PagePreviewOptions(288)));
        DocumentData data = parser.ParseByTemplate(template, options);
        if (data == null)
        {
            // Guard against a missing result instead of failing inside the LINQ query.
            Console.WriteLine("No data was extracted: parsing by template returned no result.");
            return;
        }

        // Keep only table fields and group them by page in one pass. The type test
        // already rejects null fields and null page areas, so no second filter is needed.
        var tablesByPage = data
            .Where(field => field?.PageArea is PageTableArea)
            .GroupBy(field => field!.PageArea!.Page.Index)
            .OrderBy(group => group.Key);

        foreach (var pageGroup in tablesByPage)
        {
            int pageIndex = pageGroup.Key;
            Console.WriteLine($"Tables in the Page {pageIndex + 1}");
            Console.WriteLine();

            int tableNumber = 1;
            foreach (var field in pageGroup)
            {
                // The pattern match doubles as the cast to the table type.
                if (field.PageArea is PageTableArea table)
                {
                    Console.WriteLine($" Table {tableNumber} (Field: {field.Name}): {table.RowCount} rows x {table.ColumnCount} columns");
                    ProcessTable(table);
                    tableNumber++;
                }
            }
            Console.WriteLine();
        }
    }
}
/// <summary>
/// Outputs the content of each table cell to the console as a bordered grid.
/// </summary>
/// <remarks>
/// The first row is rendered as a header, separated from the remaining rows by a
/// border line. Each column is as wide as its longest cell text, with a minimum
/// width of 3 characters. A table without rows or without columns produces no output.
/// </remarks>
/// <param name="table">The table.</param>
/// <exception cref="ArgumentNullException"><paramref name="table"/> is <c>null</c>.</exception>
static void ProcessTable(PageTableArea table)
{
    if (table == null)
    {
        throw new ArgumentNullException(nameof(table));
    }

    int rowCount = table.RowCount;
    int columnCount = table.ColumnCount;

    // Nothing to draw. This also prevents Max() from throwing on an empty row
    // range and avoids reading a header row that does not exist.
    if (rowCount <= 0 || columnCount <= 0)
    {
        return;
    }

    // Calculate column widths for proper alignment
    int[] columnWidths = Enumerable.Range(0, columnCount)
        .Select(col => Math.Max(3, Enumerable.Range(0, rowCount)
            .Max(row => GetCellText(table, row, col).Length)))
        .ToArray();

    // Border line: each segment spans the column width plus one space of padding per side.
    string separator = "+" + string.Join("+", columnWidths.Select(w => new string('-', w + 2))) + "+";

    // Writes a single table row, left-aligned and padded to the column widths.
    void WriteRow(int row)
    {
        Console.Write(" |");
        for (int col = 0; col < columnCount; col++)
        {
            string cellText = GetCellText(table, row, col);
            Console.Write($" {cellText.PadRight(columnWidths[col])} |");
        }
        Console.WriteLine();
    }

    // Display header row (first row)
    Console.WriteLine(" " + separator);
    WriteRow(0);
    Console.WriteLine(" " + separator);

    // Display data rows
    for (int row = 1; row < rowCount; row++)
    {
        WriteRow(row);
    }
    Console.WriteLine(" " + separator);
}
/// <summary>
/// Returns the text of the cell at the given position, or an empty string
/// when the cell or its text is <c>null</c>.
/// </summary>
/// <param name="table">The table to read from.</param>
/// <param name="row">Row index passed to the table indexer.</param>
/// <param name="col">Column index passed to the table indexer.</param>
/// <returns>The cell text; never <c>null</c>.</returns>
static string GetCellText(PageTableArea table, int row, int col)
{
    var cell = table[row, col];
    return cell?.Text ?? string.Empty;
}
}
}