msr-ds3 · vanessareino · May 28, 2025 · May 28, 2025 · May 29, 2025 · May 30, 2025
diff --git a/README.html b/README.html
@@ -0,0 +1,334 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+    <style type="text/css">
+      h1 {
+        color:#fc3;
+        font-family:"Lucida Grande",Verdana,sans-serif; 
+        font-size: 150%; 
+        font-weight: normal; 
+        margin:34px 0 0;
+        background-color: #7A0019;
+      }
+      p {
+        margin-left: 20px;
+      }
+      p.file_line_structure {
+        margin-left: 40px;
+      }
+      table {
+        margin-left: 30px;
+      }
+      th {
+        text-align:left;
+      }
+    </style>
+
+    <title>MovieLens 10M/100k Data Set README</title>
+  </head>
+  <body>
+    <h1>
+        Summary
+    </h1>
+    <p>
+      This data set contains 10000054 ratings and 95580 tags 
+      applied to 10681 movies by 71567 users of the 
+      online movie recommender service <a href="http://www.movielens.org">MovieLens</a>.
+    </p>
+    <p>
+      Users were selected at random for inclusion.  All users selected had rated 
+      at least 20 movies.  Unlike previous MovieLens data sets, no demographic 
+      information is included.  Each user is represented by an id, and no other 
+      information is provided.
+    </p>
+
+    <p>
+      The data are contained in three files, <code>movies.dat</code>, 
+      <code>ratings.dat</code> and <code>tags.dat</code>.
+      Also included are scripts for generating subsets of the data to support five-fold
+      cross-validation of rating predictions.  More details about the contents and use
+      of all these files <a href="#file_desc">follow</a>.
+    </p>
+
+    <p>
+      This and other GroupLens data sets are publicly available for download at 
+      <a href="http://www.grouplens.org/taxonomy/term/14">GroupLens Data Sets</a>.
+    </p>
+    <h1>
+      Usage License
+    </h1>
+    <p>
+      Neither the University of Minnesota nor any of the researchers
+      involved can guarantee the correctness of the data, its suitability
+      for any particular purpose, or the validity of results based on the
+      use of the data set.  The data set may be used for any research
+      purposes under the following conditions:
+    </p>
+    <ul>
+        <li>The user may not state or imply any endorsement from the
+       University of Minnesota or the GroupLens Research Group.</li>
+
+        <li>The user must acknowledge the use of the data set in
+	publications resulting from the use of the data set (see below 
+	for citation information).</li>
+
+        <li>The user may not redistribute the data without separate
+       permission.</li>
+
+        <li>The user may not use this information for any commercial or
+       revenue-bearing purposes without first obtaining permission
+       from a faculty member of the GroupLens Research Project at the
+       University of Minnesota.</li>
+    </ul>
+    <p>
+      The executable software scripts are provided "as is" without warranty 
+      of any kind, either expressed or implied, including, but not limited to, 
+      the implied warranties of merchantability and fitness for a particular purpose. 
+      The entire risk as to the quality and performance of them is with you. 
+      Should the program prove defective, you assume the cost of all 
+      necessary servicing, repair or correction.
+    </p>
+    <p>
+      In no event shall the University of Minnesota, its affiliates or employees 
+      be liable to you for any damages arising out of the use or inability to use 
+      these programs (including but not limited to loss of data or data being 
+      rendered inaccurate).
+    </p>
+
+    <p>
+      If you have any further questions or comments, please email <a href='mailto:grouplens-info@cs.umn.edu'>grouplens-info</a>.
+    </p>
+
+    <h1>
+      Citation
+    </h1>
+    <p>
+      To acknowledge use of the dataset in publications, please cite the
+      following paper:
+    </p>
+    <p>
+      F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
+      History and Context. ACM Transactions on Interactive Intelligent
+      Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages.
+      DOI=<a href="http://dx.doi.org/10.1145/2827872">http://dx.doi.org/10.1145/2827872</a>
+    </p>
+
+    <h1>
+        Acknowledgements
+    </h1>
+    <p>
+      Thanks to Rich Davies for generating the data set.
+    </p>
+
+    <h1>
+        Further Information About GroupLens
+    </h1>
+    <p>
+      <a href="http://www.grouplens.org/">GroupLens</a> is a research group in the 
+      <a href="http://www.cs.umn.edu/">Department of Computer Science and Engineering</a>
+      at the <a href="http://www.umn.edu/">University of Minnesota</a>.  Since its 
+      inception in 1992, GroupLens' research projects have explored a variety of fields
+      including: 
+    </p>
+    <ul>
+      <li>Information Filtering</li>
+      <li>Recommender Systems</li>
+      <li>Online Communities</li>
+      <li>Mobile and Ubiquitous Technologies</li>
+      <li>Digital Libraries</li>
+      <li>Local Geographic Information Systems.</li>
+    </ul>
+    <p>
+      GroupLens Research operates a movie recommender based on
+      collaborative filtering, <a href="http://www.movielens.org/">MovieLens</a>,
+      which is the source of these data.
+    </p>
+
+    <h1 id="file_desc">
+      Content and Use of Files
+    </h1>
+
+    <h2>
+      Character Encoding
+    </h2>
+    <p>
+      The three data files are encoded as 
+      <a href="http://en.wikipedia.org/wiki/Utf-8">UTF-8</a>.  This is a departure
+      from previous MovieLens data sets, which used different character encodings.
+      If accented characters in movie titles or tag values (e.g. Misérables, Les (1995))
+      display incorrectly, make sure that any program reading the data, such as a 
+      text editor, terminal, or script, is configured for UTF-8.
+    </p>
+
+    <h2>
+      User Ids
+    </h2>
+    <p>
+      MovieLens users were selected at random for inclusion.  Their ids have been 
+      anonymized.
+    </p>
+    <p>
+      Users were selected separately for inclusion 
+      in the ratings and tags data sets, which implies that user ids may appear in 
+      one set but not the other.
+    </p>
+    <p>
+      The anonymized values are consistent between the ratings and tags data files.  
+      That is, user id <em>n</em>, if it appears in both files, refers to the same
+      real MovieLens user.
+    </p>
+
+    <h2>
+      Ratings Data File Structure
+    </h2>
+    <p>
+      All ratings are contained in the file <code>ratings.dat</code>.  Each line of this 
+      file represents one rating of one movie by one user, and has the following format:
+    </p>
+    <p class="file_line_structure">
+      <code>UserID::MovieID::Rating::Timestamp</code>
+    </p>
+    <p>
+      The lines within this file are ordered first by UserID, then, within user, 
+      by MovieID.
+    </p>
+    <p>
+      Ratings are made on a 5-star scale, with half-star increments.
+    </p>
+    <p>
+      <a href="http://en.wikipedia.org/wiki/Unix_time">Timestamps</a> represent 
+      seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
+    </p>
+
+    <h2>
+      Tags Data File Structure
+    </h2>
+    <p>
+      All tags are contained in the file <code>tags.dat</code>.  Each line of this 
+      file represents one tag applied to one movie by one user, and has 
+      the following format:
+    </p>
+    <p class="file_line_structure">
+      <code>UserID::MovieID::Tag::Timestamp</code>
+    </p>
+    <p>
+      The lines within this file are ordered first by UserID, then, within user, 
+      by MovieID.
+    </p>
+    <p>
+      <a href="http://en.wikipedia.org/wiki/Tag_(metadata)">Tags</a> are user 
+      generated metadata about movies.  Each tag is typically a single word, or
+      short phrase.  The meaning, value and purpose of a particular tag are 
+      determined by each user.
+    </p>
+    <p>
+      <a href="http://en.wikipedia.org/wiki/Unix_time">Timestamps</a> represent 
+      seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
+    </p>
+
+    <h2>
+      Movies Data File Structure
+    </h2>
+    <p>
+      Movie information is contained in the file <code>movies.dat</code>.  
+      Each line of this file represents one movie, and has the following format:
+    </p>
+    <p class="file_line_structure">
+      <code>MovieID::Title::Genres</code>
+    </p>
+    <p>
+      MovieID is the real MovieLens id.
+    </p>
+    <p>
+      Movie titles, by policy, should be entered identically to those
+      found in <a href="http://www.imdb.com/">IMDB</a>, including year of release.
+      However, they are entered manually, so errors and inconsistencies may exist.
+    </p>
+    <p>
+      Genres are a pipe-separated list, and are selected from the following:
+    </p>
+    <ul>
+	  <li>Action</li>
+	  <li>Adventure</li>
+	  <li>Animation</li>
+	  <li>Children's</li>
+	  <li>Comedy</li>
+	  <li>Crime</li>
+	  <li>Documentary</li>
+	  <li>Drama</li>
+	  <li>Fantasy</li>
+	  <li>Film-Noir</li>
+	  <li>Horror</li>
+	  <li>Musical</li>
+	  <li>Mystery</li>
+	  <li>Romance</li>
+	  <li>Sci-Fi</li>
+	  <li>Thriller</li>
+	  <li>War</li>
+	  <li>Western</li>
+    </ul>
+
+    <h2>
+      Cross-Validation Subset Generation Scripts
+    </h2>
+    <p>
+      A Unix shell script, <code>split_ratings.sh</code>, is provided that, if desired, 
+      can be used to split the ratings data for five-fold cross-validation
+      of rating predictions.  It depends on a second script, <code>allbut.pl</code>, which 
+      is also included and is written in Perl.  They should run without modification
+      under Linux, Mac OS X, Cygwin or other Unix-like systems.
+    </p>
+    <p>
+      Running <code>split_ratings.sh</code> will use <code>ratings.dat</code>
+      as input, and produce the fourteen output files described below.  Multiple
+      runs of the script will produce identical results.
+    </p>
+    <table style="width:75%" border="1">
+      <tr>
+        <th style="width:25%">File Names</th>
+        <th>Description</th>
+      </tr>
+      <tr>
+        <td>
+          r1.train, r2.train, r3.train, r4.train, r5.train<br/>
+          r1.test, r2.test, r3.test, r4.test, r5.test<br/>
+         </td>
+         <td>
+           The data sets r1.train and r1.test through r5.train and r5.test
+           are 80%/20% splits of the ratings data into training and test data.
+           The test sets of r1, ..., r5 are mutually disjoint; this is for
+           five-fold cross-validation (where you repeat your experiment
+           with each training and test set and average the results).
+        </td>
+      </tr>
+      <tr>
+        <td>
+          ra.train, rb.train<br/>
+          ra.test, rb.test<br/>
+         </td>
+        <td>
+          The data sets ra.train, ra.test, rb.train, and rb.test
+          split the ratings data into a training set and a test set with
+          exactly 10 ratings per user in the test set.  The sets
+          ra.test and rb.test are disjoint.
+         </td>
+      </tr>
+    </table>
+    <p style="text-align:right">
+      <a href="http://validator.w3.org/check?uri=referer">
+        <img style="border:0;width:88px;height:31px"
+          src="http://www.w3.org/Icons/valid-xhtml10"
+          alt="Valid XHTML 1.0 Strict" height="31" width="88" />
+      </a>
+
+      <a href="http://jigsaw.w3.org/css-validator/">
+        <img style="border:0;width:88px;height:31px"
+          src="http://jigsaw.w3.org/css-validator/images/vcss"
+          alt="Valid CSS!" />
+      </a>
+    </p>
+  </body>
+</html>
+
diff --git a/allbut.pl b/allbut.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/env perl
+#
+# allbut.pl - split "::"-delimited rating lines into a test file and a train file.
+#
+# Usage: allbut.pl base_name start stop max_test [ratings ...]
+#
+# Lines are read from the named ratings files, or from standard input when
+# none are given.  The text before the first "::" on a line is its user id.
+# Each user's lines are numbered 1, 2, 3, ... in the order they are read; a
+# line whose per-user number lies in [start, stop] is written to
+# base_name.test, and every other line is written to base_name.train.
+# When max_test is greater than zero, at most max_test lines in total go to
+# the test file; lines past that cap are written to the train file instead.
+
+use strict;
+use warnings;
+
+# get args
+if (@ARGV < 3) {
+	print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n";
+	exit 1;
+}
+my $basename = shift;
+my $start = shift;
+my $stop = shift;
+my $maxtest = shift;
+
+# Only three arguments are enforced above, so max_test may be absent.  An
+# undefined value already compared as 0 (no cap); make that explicit.
+$maxtest = 0 unless defined $maxtest;
+
+# open files; the three-argument form keeps characters in $basename from
+# being interpreted as part of the open mode
+open( my $testfile, '>', "$basename.test" ) or die "Cannot open $basename.test for writing: $!\n";
+open( my $basefile, '>', "$basename.train" ) or die "Cannot open $basename.train for writing: $!\n";
+
+# init variables
+my $testcnt = 0;      # lines written to the test file so far
+my %ratingcnt;        # user id => number of that user's lines read so far
+
+while ( my $line = <> ) {
+	my ($user) = split /::/, $line, 2;
+	my $seen = ++$ratingcnt{$user};    # a missing entry counts up from 0
+	if ( ( $testcnt < $maxtest || $maxtest <= 0 )
+		&& $seen >= $start && $seen <= $stop ) {
+		++$testcnt;
+		print {$testfile} $line;
+	}
+	else {
+		print {$basefile} $line;
+	}
+}
+
+# Closing flushes buffered output; check it so a failed write (for example a
+# full disk) is reported instead of silently producing a truncated file.
+close($testfile) or die "Cannot close $basename.test: $!\n";
+close($basefile) or die "Cannot close $basename.train: $!\n";