// DocUtility/DocUtility.java at master · tarunwalia/DocUtility · GitHub

package com.mywork.services;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFFooter;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;

/**
 * @author tarun
 *
 */
public class DocUtility {
	/** Message used whenever a path does not end in a supported Word extension. */
	private static final String UNSUPPORTED_EXTENSION_MESSAGE = "File extension is not as expected. It should be doc or docx.";

	private WordExtractor wordExtractor;
	private HWPFDocument doc;
	private HWPFDocument targetDoc;
	private XWPFDocument xDoc;
	private XWPFDocument targetxDoc;
	private int pageCount;
	private int targetPageCount;

	/**
	 * Loads the Word document at {@code filePath}. Files ending in {@code .doc}
	 * are opened as {@link HWPFDocument}, files ending in {@code .docx} as
	 * {@link XWPFDocument}; the extension check is case-insensitive. The page
	 * count is read once during construction.
	 *
	 * @param filePath path of the doc/docx file to load
	 * @throws FileNotFoundException  if the extension is neither doc nor docx, or
	 *                                the file cannot be opened
	 * @throws IOException            if the file cannot be read
	 * @throws InvalidFormatException if the docx package cannot be opened
	 */
	public DocUtility(String filePath) throws IOException, InvalidFormatException {
		String fileExt = getFileExtension(filePath);
		boolean binaryDoc = "doc".equalsIgnoreCase(fileExt);
		// Reject unsupported extensions before touching the file so no stream is opened needlessly.
		if (!binaryDoc && !"docx".equalsIgnoreCase(fileExt)) {
			throw new FileNotFoundException(UNSUPPORTED_EXTENSION_MESSAGE);
		}
		// The stream is only needed while the document is being parsed; close it right after.
		try (FileInputStream fis = new FileInputStream(new File(filePath))) {
			if (binaryDoc) {
				doc = new HWPFDocument(fis);
			} else {
				xDoc = new XWPFDocument(OPCPackage.open(fis));
			}
		}
		// Use the private reader rather than the overridable public getter inside the constructor.
		pageCount = readPageCount();
	}

	/**
	 * Re-reads the page count of the loaded document and caches it in this
	 * instance.
	 *
	 * @return the page count reported by the document's properties
	 */
	public int getPageCount() {
		pageCount = readPageCount();
		return pageCount;
	}

	/**
	 * Reads the page count from the extended properties (docx) or from the
	 * summary information (doc) of the loaded document.
	 */
	private int readPageCount() {
		if (null != xDoc) {
			return xDoc.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
		}
		return doc.getSummaryInformation().getPageCount();
	}

	/**
	 * Extracts the text of the loaded document.
	 *
	 * @return the text produced by the XWPF (docx) or HWPF (doc) word extractor
	 */
	public String getText() {
		if (null != xDoc) {
			return new XWPFWordExtractor(xDoc).getText();
		}
		wordExtractor = new WordExtractor(doc);
		return wordExtractor.getText();
	}

	/**
	 * Returns the trimmed cell texts of one table of the loaded docx document,
	 * one inner list per table row.
	 *
	 * @param tableNumber zero-based index into the document's table list
	 * @return the rows of the requested table, or an empty list when the
	 *         document contains no tables
	 * @throws UnsupportedOperationException if the loaded document is a binary
	 *                                       doc file (no XWPF document present)
	 * @throws IndexOutOfBoundsException     if the document has tables but
	 *                                       {@code tableNumber} is out of range
	 */
	public List<List<String>> getTableData(int tableNumber) {
		if (null == xDoc) {
			throw new UnsupportedOperationException("Table data can only be read from docx files.");
		}
		List<List<String>> rows = new ArrayList<>();
		List<XWPFTable> tables = xDoc.getTables();
		if (tables.isEmpty()) {
			return rows;
		}
		// Look the table up once; visiting it once per TABLE body element would repeat its rows.
		XWPFTable table = tables.get(tableNumber);
		int rowCount = table.getRows().size();
		for (int r = 0; r < rowCount; r++) {
			int cellCount = table.getRow(r).getTableCells().size();
			List<String> cells = new ArrayList<>(cellCount);
			for (int c = 0; c < cellCount; c++) {
				cells.add(table.getRow(r).getCell(c).getText().trim());
			}
			rows.add(cells);
		}
		return rows;
	}

	/**
	 * Compares the loaded document with the document at {@code targetDoc}. The
	 * two match when their page counts are equal and their extracted texts are
	 * equal.
	 *
	 * @param targetDoc path of the doc/docx file to compare against
	 * @return {@code true} if page count and text both match
	 * @throws IOException            if the target file cannot be read or has an
	 *                                unsupported extension
	 * @throws InvalidFormatException if the target docx package cannot be opened
	 */
	public boolean compareDoc(String targetDoc) throws IOException, InvalidFormatException {
		String sourceDocText = getText();
		// Loading the target text also refreshes targetPageCount, so it must run before the comparison.
		String targetDocText = getTargetDocText(targetDoc);
		return pageCount == targetPageCount && sourceDocText.equals(targetDocText);
	}

	/**
	 * Checks whether the document text contains {@code searchString}, ignoring
	 * case. Case folding uses {@link Locale#ROOT} so the result does not depend
	 * on the JVM's default locale.
	 *
	 * @param searchString text to look for; must not be {@code null}
	 * @return {@code true} if the text occurs in the document
	 * @throws NullPointerException if {@code searchString} is {@code null}
	 */
	public boolean searchText(String searchString) {
		String haystack = getText().toLowerCase(Locale.ROOT);
		return haystack.contains(searchString.toLowerCase(Locale.ROOT));
	}

	/*
	 * NOTE: disabled draft, kept for reference. Its table branch hard-codes
	 * "2011-2015" -> "2009-2015" instead of using source/target.
	 *
	 * public void replaceText(String source, String target, String outputFilePath)
	 * throws IOException { for (XWPFParagraph p : xDoc.getParagraphs()) {
	 * List<XWPFRun> runs = p.getRuns(); if (runs != null) { for (XWPFRun r : runs)
	 * { String text = r.getText(0); if (text != null && text.contains(source)) {
	 * text = text.replace(source, target); r.setText(text, 0); } } } }
	 *
	 * for (XWPFTable tbl : xDoc.getTables()) { for (XWPFTableRow row :
	 * tbl.getRows()) { for (XWPFTableCell cell : row.getTableCells()) { for
	 * (XWPFParagraph p : cell.getParagraphs()) { for (XWPFRun r : p.getRuns()) {
	 * String text = r.getText(0); if (text != null && text.contains("2011-2015")) {
	 * text = text.replace("2011-2015", "2009-2015"); r.setText(text, 0); } } } } }
	 * } xDoc.write(new FileOutputStream(outputFilePath)); }
	 */

	/**
	 * Loads the comparison document, records its page count in
	 * {@code targetPageCount} and returns its extracted text.
	 *
	 * @param targetDocPath path of the doc/docx file to load
	 * @return the extracted text of the target document
	 * @throws FileNotFoundException  if the extension is neither doc nor docx, or
	 *                                the file cannot be opened
	 * @throws IOException            if the file cannot be read
	 * @throws InvalidFormatException if the docx package cannot be opened
	 */
	private String getTargetDocText(String targetDocPath) throws IOException, InvalidFormatException {
		String fileExt = getFileExtension(targetDocPath);
		boolean binaryDoc = "doc".equalsIgnoreCase(fileExt);
		if (!binaryDoc && !"docx".equalsIgnoreCase(fileExt)) {
			throw new FileNotFoundException(UNSUPPORTED_EXTENSION_MESSAGE);
		}
		try (FileInputStream fis = new FileInputStream(new File(targetDocPath))) {
			if (binaryDoc) {
				targetDoc = new HWPFDocument(fis);
				targetPageCount = targetDoc.getSummaryInformation().getPageCount();
				// Local extractor: the shared wordExtractor field belongs to the source document.
				return new WordExtractor(targetDoc).getText();
			}
			targetxDoc = new XWPFDocument(OPCPackage.open(fis));
			targetPageCount = targetxDoc.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
			return new XWPFWordExtractor(targetxDoc).getText();
		}
	}

	/**
	 * Returns the header text of the loaded document. For docx this is the text
	 * of the default header; for doc it is the header that
	 * {@link HeaderStories} reports for the page numbered by the cached page
	 * count.
	 *
	 * @return the header text, or {@code null} when a docx document has no
	 *         default header
	 */
	public String getHeaderText() {
		if (null == xDoc) {
			return new HeaderStories(doc).getHeader(this.pageCount);
		}
		XWPFHeader xfHeader = new XWPFHeaderFooterPolicy(xDoc).getDefaultHeader();
		return (null != xfHeader) ? xfHeader.getText() : null;
	}

	/**
	 * Returns the footer text of the loaded document. For docx this is the text
	 * of the default footer; for doc it is the footer that
	 * {@link HeaderStories} reports for the page numbered by the cached page
	 * count.
	 *
	 * @return the footer text, or {@code null} when a docx document has no
	 *         default footer
	 */
	public String getFooterText() {
		if (null == xDoc) {
			return new HeaderStories(doc).getFooter(this.pageCount);
		}
		XWPFFooter xfFooter = new XWPFHeaderFooterPolicy(xDoc).getDefaultFooter();
		return (null != xfFooter) ? xfFooter.getText() : null;
	}

	/**
	 * Returns the extension of {@code filePath}: the characters after the last
	 * dot of the final path segment, without the dot.
	 *
	 * @param filePath path to inspect; must not be {@code null}
	 * @return the extension, or an empty string when the final path segment
	 *         contains no dot
	 */
	private static String getFileExtension(String filePath) {
		int lastSeparator = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
		int lastDot = filePath.lastIndexOf('.');
		// A dot that sits inside a directory name does not start an extension.
		if (lastDot <= lastSeparator) {
			return "";
		}
		return filePath.substring(lastDot + 1);
	}

}