-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathInvertedIndex.java
More file actions
116 lines (101 loc) · 4.02 KB
/
InvertedIndex.java
File metadata and controls
116 lines (101 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import java.util.Map;
import java.util.HashMap;
import java.io.*;
import java.util.Scanner;
import java.util.HashSet;
/**
 * An inverted index over a document corpus, loaded from a serialized index file.
 * Holds per-term document frequencies, per-term IDF values, and pre-computed
 * (squared) document vector norms for cosine-similarity scoring.
 */
class InvertedIndex {
	// term -> {doc1 -> term frequency, doc2 -> term frequency, ...}
	public HashMap<String, HashMap<String, Integer>> invertedIndexTFs;
	// term -> inverse-document-frequency value
	public HashMap<String, Double> termIDFs;
	// docName -> vectorNorm value (pre-computed to use in cosine similarity calculations)
	// Note !!! this is pre-computed WITHOUT taking the square root (thus it is the norm squared)
	public HashMap<String, Double> docVectorNormsSquared;
	// set of all document names seen anywhere in the index
	public HashSet<String> corpusDocs;
	// fixed name of the serialized index file inside the index directory
	private static final String indexFileName = "index.txt";

	/** Creates an empty index; populate it via {@link #constructInvertedIndexFromFile(String)}. */
	public InvertedIndex()
	{
		invertedIndexTFs = new HashMap<String, HashMap<String, Integer>>();
		termIDFs = new HashMap<String, Double>();
		docVectorNormsSquared = new HashMap<String, Double>();
		corpusDocs = new HashSet<String>();
	}

	/**
	 * Loads {@code index.txt} from the given directory into the class data structures.
	 *
	 * Expects one line per term, with comma-separated fields:
	 * <pre>term,doc1,doc1-term-freq,doc2,doc2-term-freq,...,term-inverse-doc-frequency</pre>
	 *
	 * Populates:
	 * <ul>
	 *   <li>the inverted index with document term frequencies</li>
	 *   <li>the term -> IDF lookup</li>
	 *   <li>pre-computed squared document vector norms
	 *       (sum of the squares of all tf-idf values for the terms in each doc)</li>
	 * </ul>
	 *
	 * Blank or malformed lines are skipped (with a warning on stderr) instead of
	 * aborting the whole load. Prints "File not found" if the index file is missing.
	 *
	 * @param indexDirectory directory containing {@code index.txt}; a trailing
	 *                       slash is appended if absent
	 */
	public void constructInvertedIndexFromFile(String indexDirectory)
	{
		// endsWith() is safe on empty strings, unlike substring(length - 1)
		if (!indexDirectory.endsWith("/"))
		{
			indexDirectory = indexDirectory + "/";
		}
		Scanner scan = null;
		try
		{
			// Explicit UTF-8 so the parse does not depend on the platform default charset
			scan = new Scanner(new File(indexDirectory + indexFileName), "UTF-8");
			int lineNumber = 0; // for error reporting
			while (scan.hasNextLine())
			{
				lineNumber++;
				String line = scan.nextLine(); // Read one line of the text file into a string
				String[] parts = line.trim().split(","); // Split the line by comma into a String array
				// A valid line needs at least "term" and a trailing IDF field;
				// blank lines split to a single empty field and are skipped here.
				if (parts.length < 2)
				{
					continue;
				}
				try
				{
					String token = parts[0];
					// The last field is the term's inverse document frequency
					double idf = Double.parseDouble(parts[parts.length - 1]);
					// (doc, term-frequency) pairs occupy parts[1..length-2];
					// NOTE!! the loop variable increments by 2 each iteration
					HashMap<String, Integer> docTFs = new HashMap<String, Integer>();
					for (int i = 1; i < parts.length - 1; i += 2)
					{
						String docName = parts[i];
						int termFreq = Integer.parseInt(parts[i + 1]);
						docTFs.put(docName, termFreq);
						// build up non-duplicate set of corpus doc names
						corpusDocs.add(docName);
						// accumulate this term's squared tf-idf into the doc's norm
						double tfIDFSquared = (termFreq * idf) * (termFreq * idf);
						Double prev = docVectorNormsSquared.get(docName);
						docVectorNormsSquared.put(docName, (prev == null ? 0.0 : prev) + tfIDFSquared);
					}
					// Commit the term only after the whole line parsed cleanly,
					// so a malformed line never leaves a half-populated entry.
					termIDFs.put(token, idf);
					invertedIndexTFs.put(token, docTFs);
				}
				catch (NumberFormatException e)
				{
					// Skip the bad line and keep loading instead of aborting everything
					System.err.println("Skipping malformed index line " + lineNumber + ": " + line);
				}
			}
			//Print vocab size
			//System.out.println("Size of vocabulary: "+ invertedIndexTFs.size());
			//System.out.println("Number of docs: "+ corpusDocs.size());
		}
		catch (FileNotFoundException e)
		{
			System.out.print("File not found\n");
		}
		finally
		{
			if (scan != null)
			{
				scan.close(); // releases the underlying file handle
			}
		}
	}
}