diff --git a/pom.xml b/pom.xml index 8169ff7..7826f26 100644 --- a/pom.xml +++ b/pom.xml @@ -9,11 +9,23 @@ big-data http://maven.apache.org - + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + UTF-8 + 1.8 + 1.8 - org.apache.mahout diff --git a/src/data/movies.txt.gz b/src/data/movies.txt.gz new file mode 100644 index 0000000..020e8a6 Binary files /dev/null and b/src/data/movies.txt.gz differ diff --git a/src/main/java/movierec/MovieRecommender.java b/src/main/java/movierec/MovieRecommender.java new file mode 100644 index 0000000..260eb45 --- /dev/null +++ b/src/main/java/movierec/MovieRecommender.java @@ -0,0 +1,125 @@ +// this class works with the src/data/movies.txt.gz compressed file as input +// the src/data/movies.txt.gz current file is a provisional one for storage reasons +// so, in order to pass the test properly, it needs to be replaced with the original 3+ GB file +// ... with the correct file, this class generates an intermediate CSV file with clean data, +// which is about 150 MB big + +package movierec; + +import java.io.*; +import java.util.zip.GZIPInputStream; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.List; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +public class MovieRecommender { + int totalProducts; + int totalUsers; + int totalReviews; + // keep hash tables 
+    // to be able to 'translate' between numeric and alphanumeric IDs
+    Hashtable<String, Integer> products;
+    Hashtable<String, Integer> users;
+    Hashtable<Integer, String> productsByID;
+    String myPath;
+    String output;
+
+    /**
+     * Creates a recommender and immediately converts the gzip-compressed review
+     * file into the intermediate CSV file by calling {@link #dataWrangling()}.
+     *
+     * @param pathTidyFile path of the gzip-compressed review file to read
+     * @throws Exception if the input cannot be read or the CSV cannot be written
+     */
+    public MovieRecommender(String pathTidyFile) throws Exception {
+        this.totalProducts = 0;
+        this.totalUsers = 0;
+        this.totalReviews = 0;
+        this.products = new Hashtable<>();
+        this.users = new Hashtable<>();
+        this.productsByID = new Hashtable<>();
+        this.myPath = pathTidyFile;
+        this.output = "src/data/nice_data.csv";
+        dataWrangling();
+    }
+
+    /**
+     * Cleans and re-arranges the review data so that it is a valid input for
+     * the mahout FileDataModel class.
+     *
+     * <p>The gzip file is read line by line. Lines containing
+     * {@code "review/userId: "}, {@code "product/productId: "} or
+     * {@code "review/score: "} supply the user, product and score of a review
+     * (the second space-separated token of the line). Every alphanumeric user
+     * or product ID seen for the first time is given the next sequential
+     * numeric ID, starting at 1. As soon as user, product and score are all
+     * set, one {@code numericUser,numericProduct,score} row is written to the
+     * CSV file and the three values are cleared for the next review.
+     *
+     * @throws IOException if the input cannot be read or the CSV cannot be written
+     */
+    public void dataWrangling() throws IOException {
+        final String productStr = "product/productId: ";
+        final String userStr = "review/userId: ";
+        final String scoreStr = "review/score: ";
+        String user = "";
+        String product = "";
+        String score = "";
+
+        // try-with-resources closes every stream, even when reading or writing fails midway
+        try (InputStream fileStream = new FileInputStream(this.myPath);
+             InputStream gzipStream = new GZIPInputStream(fileStream);
+             BufferedReader buffered = new BufferedReader(new InputStreamReader(gzipStream));
+             Writer csvWriter = new BufferedWriter(new FileWriter(this.output))) {
+            String auxLine;
+            while ((auxLine = buffered.readLine()) != null) {
+                if (auxLine.contains(userStr)) {
+                    user = auxLine.split(" ")[1];
+                    if (!this.users.containsKey(user)) {
+                        this.totalUsers++;
+                        this.users.put(user, this.totalUsers);
+                    }
+                }
+                if (auxLine.contains(productStr)) {
+                    product = auxLine.split(" ")[1];
+                    if (!this.products.containsKey(product)) {
+                        this.totalProducts++;
+                        this.products.put(product, this.totalProducts);
+                        this.productsByID.put(this.totalProducts, product);
+                    }
+                }
+                if (auxLine.contains(scoreStr)) {
+                    score = auxLine.split(" ")[1];
+                    this.totalReviews++;
+                }
+                // compare by content: == / != on strings only tests object identity
+                if (!user.isEmpty() && !product.isEmpty() && !score.isEmpty()) {
+                    csvWriter.write(this.users.get(user) + "," + this.products.get(product) + "," + score + "\n");
+                    user = "";
+                    product = "";
+                    score = "";
+                }
+            }
+        }
+    }
+
+    // this method gets 3 item recommendations as output
+    // for a given user as input
+    /**
+     * Returns up to 3 recommended product IDs for the given user.
+     *
+     * <p>The data model is loaded from the CSV file written by
+     * {@code dataWrangling()} on every call. Numeric item IDs returned by the
+     * recommender are translated back to alphanumeric product IDs.
+     *
+     * @param userID alphanumeric user ID as it appears in the review file
+     * @return the recommended alphanumeric product IDs; empty if the user is
+     *         unknown or nothing can be recommended
+     * @throws IOException if the CSV file cannot be read
+     * @throws TasteException if the recommender fails
+     */
+    public List<String> getRecommendationsForUser(String userID) throws IOException, TasteException {
+        Integer numericUserId = this.users.get(userID);
+        if (numericUserId == null) {
+            // unknown user: unboxing null for recommend(long, int) would throw a NullPointerException
+            return new ArrayList<>();
+        }
+
+        DataModel model = new FileDataModel(new File(this.output));
+        UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
+        UserNeighborhood neighborhood = new ThresholdUserNeighborhood(0.1, similarity, model);
+        UserBasedRecommender recommender =
+                new GenericUserBasedRecommender(model, neighborhood, similarity);
+
+        List<String> recommendations = new ArrayList<>();
+        for (RecommendedItem recommendation : recommender.recommend(numericUserId.longValue(), 3)) {
+            // item IDs in the CSV are the int keys of productsByID, so narrowing to int is safe
+            recommendations.add(this.productsByID.get((int) recommendation.getItemID()));
+        }
+        return recommendations;
+    }
+
+    /** Returns the number of score lines counted while reading the review file. */
+    public int getTotalReviews() {
+        return totalReviews;
+    }
+
+    /** Returns the number of distinct product IDs found in the review file. */
+    public int getTotalProducts() {
+        return totalProducts;
+    }
+
+    /** Returns the number of distinct user IDs found in the review file. */
+    public int getTotalUsers() {
+        return totalUsers;
+    }
+}
+
diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java
index 0d0b1fe..8fb09e3 100644
--- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java
+++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java
@@ -1,22 +1,24 @@
+// PLEASE CONSIDER THAT...
+// in order to pass the test properly, +// the current src/data/movies.txt.gz file needs to be replaced with the original 3+ GB file package nearsoft.academy.bigdata.recommendation; -import org.apache.mahout.cf.taste.common.TasteException; +import movierec.MovieRecommender; import org.junit.Test; - -import java.io.IOException; import java.util.List; - import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThat; import static org.junit.matchers.JUnitMatchers.hasItem; + public class MovieRecommenderTest { @Test - public void testDataInfo() throws IOException, TasteException { + public void testDataInfo() throws Exception { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); - assertEquals(7911684, recommender.getTotalReviews()); + + MovieRecommender recommender = new MovieRecommender("src/data/movies.txt.gz"); + assertEquals(7911684, recommender.getTotalReviews()); // main class attributes assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers()); @@ -24,7 +26,5 @@ public void testDataInfo() throws IOException, TasteException { assertThat(recommendations, hasItem("B0002O7Y8U")); assertThat(recommendations, hasItem("B00004CQTF")); assertThat(recommendations, hasItem("B000063W82")); - } - -} +} \ No newline at end of file