InformationRetrieval/SearchFiles.java at master · VishnuPriyaChandraSekar/InformationRetrieval · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
package edu.asu.irs13;

import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import java.io.File;
import java.util.Scanner;

/**
 * Small command-line tour of a Lucene index stored in the directory "index".
 *
 * It prints the document count, a sample of the indexed terms, the postings
 * for the term 'brute' in the 'contents' field, the URL of one sample
 * document, and then answers interactive queries read from standard input
 * (unranked: every document containing any query word is listed).
 */
public class SearchFiles {
	/** Field that holds the indexed text of each document. */
	private static final String CONTENTS_FIELD = "contents";

	/** Field that holds the document URL, with "/" stored as "%%". */
	private static final String PATH_FIELD = "path";

	/** First and last term ordinals (inclusive) printed by the term sample. */
	private static final int FIRST_SAMPLE_TERM = 100000;
	private static final int LAST_SAMPLE_TERM = 100010;

	/** Document number whose URL is printed as an example. */
	private static final int SAMPLE_DOC = 14191;

	/**
	 * Entry point. Opens the index, runs the demonstrations and the query loop,
	 * and releases the reader and directory even if one of the steps fails.
	 *
	 * @param args unused
	 * @throws Exception if the index cannot be opened or read
	 */
	public static void main(String[] args) throws Exception
	{
		Directory dir = FSDirectory.open(new File("index"));
		try
		{
			// the IndexReader object is the main handle that will give you
			// all the documents, terms and inverted index
			IndexReader r = IndexReader.open(dir);
			try
			{
				// You can figure out the number of documents using the maxDoc() function
				System.out.println("The number of documents in this index is: " + r.maxDoc());

				printTermSample(r);

				// You can create your own query terms by calling the Term constructor, with the field 'contents'
				// In the following example, the query term is 'brute'
				Term te = new Term(CONTENTS_FIELD, "brute");

				// You can also quickly find out the number of documents that have term t
				System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

				printPostings(r, te);
				printSampleUrl(r);
				runQueryLoop(r);
			}
			finally
			{
				r.close();
			}
		}
		finally
		{
			// The reader was opened on a caller-supplied Directory, so the
			// directory is closed here explicitly.
			dir.close();
		}
	}

	/** Prints terms #100000 through #100010 (inclusive) of the index's term enumeration. */
	private static void printTermSample(IndexReader r) throws Exception
	{
		// You can find out all the terms that have been indexed using the terms() function
		TermEnum t = r.terms();
		try
		{
			int i = 0;
			while (t.next())
			{
				// Since there are so many terms, print only the sample window.
				// '>=' so that term #100000 itself is included.
				if (i >= FIRST_SAMPLE_TERM) System.out.println("["+i+"] " + t.term().text());
				if (++i > LAST_SAMPLE_TERM) break;
			}
		}
		finally
		{
			t.close();
		}
	}

	/** Prints every document that contains the term 'brute' together with its in-document frequency. */
	private static void printPostings(IndexReader r, Term te) throws Exception
	{
		// You can use the inverted index to find out all the documents that contain a term
		// by using the termDocs function
		TermDocs td = r.termDocs(te);
		try
		{
			while (td.next())
			{
				System.out.println("Document number ["+td.doc()+"] contains the term 'brute' " + td.freq() + " time(s).");
			}
		}
		finally
		{
			td.close();
		}
	}

	/** Prints the URL of the sample document, if the index actually contains it. */
	private static void printSampleUrl(IndexReader r) throws Exception
	{
		// The sample number is hard-coded; skip it rather than crash on a
		// smaller index or on a deleted document.
		if (SAMPLE_DOC >= r.maxDoc() || r.isDeleted(SAMPLE_DOC))
		{
			return;
		}
		// You can find the URL of a specific document number using the document() function
		Document d = r.document(SAMPLE_DOC);
		String url = d.getFieldable(PATH_FIELD).stringValue(); // the 'path' field of the Document object holds the URL
		System.out.println(url.replace("%%", "/"));
	}

	/**
	 * Reads queries from standard input until the line "quit" or end of input.
	 * For each whitespace-separated word, lists every document whose 'contents'
	 * field contains it (no ranking, no de-duplication across words).
	 */
	private static void runQueryLoop(IndexReader r) throws Exception
	{
		Scanner sc = new Scanner(System.in);
		try
		{
			System.out.print("query> ");
			// hasNextLine() lets the loop end quietly on end of input instead
			// of nextLine() throwing NoSuchElementException.
			while (sc.hasNextLine())
			{
				String str = sc.nextLine();
				if (str.equals("quit"))
				{
					break;
				}
				String[] terms = str.split("\\s+");
				for (String word : terms)
				{
					// Leading whitespace makes split() yield an empty first token.
					if (word.length() == 0)
					{
						continue;
					}
					printMatches(r, new Term(CONTENTS_FIELD, word));
				}
				System.out.print("query> ");
			}
		}
		finally
		{
			sc.close();
		}
	}

	/** Prints "[docNumber] url" for every document containing the given term. */
	private static void printMatches(IndexReader r, Term term) throws Exception
	{
		TermDocs tdocs = r.termDocs(term);
		try
		{
			while (tdocs.next())
			{
				String d_url = r.document(tdocs.doc()).getFieldable(PATH_FIELD).stringValue().replace("%%", "/");
				System.out.println("["+tdocs.doc()+"] " + d_url);
			}
		}
		finally
		{
			// One TermDocs is opened per query word, so close each one.
			tdocs.close();
		}
	}
}