Commit a3ea9c6
Hao Wang committed May 17, 2017 (1 parent: 7ccd2fd)
Showing 321 changed files with 317,339 additions and 0 deletions.
410TextInformationSystems/MPs/cs410/assign4/exp/qrel: 3,720 additions, 0 deletions (large diff not rendered)
BIN +2.06 KB  410TextInformationSystems/MPs/cs410/assign4/obj/ComputeDocLen$MapClass.class (binary file added, not shown)
BIN +1.8 KB   410TextInformationSystems/MPs/cs410/assign4/obj/ComputeDocLen$Reduce.class (binary file added, not shown)
BIN +2.83 KB  410TextInformationSystems/MPs/cs410/assign4/obj/InvertedIndex$MapClass.class (binary file added, not shown)
BIN +1.85 KB  410TextInformationSystems/MPs/cs410/assign4/obj/InvertedIndex$Reduce.class (binary file added, not shown)
410TextInformationSystems/MPs/cs410/assign4/src/ComputeDocLen.java: 156 additions, 0 deletions
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * This is a Hadoop Map/Reduce application that computes document lengths based on
 * the "raw inverted index" (i.e., the output generated by "InvertedIndex").
 *
 * To run: hadoop jar simir.jar ComputeDocLen
 *           [-m <i>maps</i>] [-r <i>reduces</i>] <i>in-dir</i> <i>out-dir</i>
 * "in-dir" has all the raw inverted index files generated by "InvertedIndex"
 * "out-dir" is the directory to put the document length table.
 */
public class ComputeDocLen extends Configured implements Tool {

  /**
   * For each line of input, skip the first string (the term), then read each
   * (docID, termCount) pair and emit (<b>docID</b>, <b>termCount</b>).
   */
  public static class MapClass extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, Text> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private Text did = new Text();

    public void map(LongWritable key, Text value,
                    OutputCollector<Text, Text> output,
                    Reporter reporter) throws IOException {
      String line = value.toString();
      StringTokenizer itr = new StringTokenizer(line);
      String docID = "";
      String term = "";

      if (itr.hasMoreTokens()) {
        term = itr.nextToken();
        while (itr.hasMoreTokens()) {
          docID = itr.nextToken();
          did.set(docID);
          word.set(itr.nextToken());
          output.collect(did, word);
        }
      }
    }
  }

  /**
   * A reducer class that just emits the sum of the input values.
   */
  public static class Reduce extends MapReduceBase
    implements Reducer<Text, Text, Text, Text> {

    Text s = new Text();

    public void reduce(Text key, Iterator<Text> values,
                       OutputCollector<Text, Text> output,
                       Reporter reporter) throws IOException {
      String sum = "";
      int count = 0;
      while (values.hasNext()) {
        count = count + Integer.parseInt(values.next().toString().trim());
      }
      Text t = new Text();
      t.set(Integer.toString(count));
      output.collect(key, t);
    }
  }

  static int printUsage() {
    System.out.println("ComputeDocLen [-m <maps>] [-r <reduces>] <input> <output>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * The main driver for the ComputeDocLen map/reduce program.
   * Invoke this method to submit the map/reduce job.
   * @throws IOException When there are communication problems with the
   *                     job tracker.
   */
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ComputeDocLen.class);
    conf.setJobName("computedoclength");

    // the keys are docIDs (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are strings too
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " +
                           args[i - 1]);
        return printUsage();
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println("ERROR: Wrong number of parameters: " +
                         other_args.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new ComputeDocLen(), args);
    System.exit(res);
  }

}
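To make the data flow concrete, the following is a minimal, self-contained sketch (not part of the commit) of what MapClass and Reduce compute together: for each raw-inverted-index line, the term is skipped, each (docID, termCount) pair is emitted, and the counts are summed per document to give its length. The sample lines, document IDs, and the DocLenDemo class name are made up for illustration.

import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

// Hypothetical illustration of ComputeDocLen's logic on a couple of raw inverted index lines.
// Assumed input format (per MapClass): "term docID1 count1 docID2 count2 ..."
public class DocLenDemo {
  public static void main(String[] args) {
    // two made-up index lines, one per term
    String[] rawIndexLines = {
      "apple doc1 3 doc2 5",
      "banana doc1 2 doc3 7"
    };
    // the reducer's role: sum, per docID, all term counts emitted by the mapper
    Map<String, Integer> docLen = new HashMap<String, Integer>();
    for (String line : rawIndexLines) {
      StringTokenizer itr = new StringTokenizer(line);
      itr.nextToken();                          // skip the term, as MapClass does
      while (itr.hasMoreTokens()) {
        String docID = itr.nextToken();         // (docID, termCount) pair
        int termCount = Integer.parseInt(itr.nextToken());
        Integer old = docLen.get(docID);
        docLen.put(docID, (old == null ? 0 : old) + termCount);
      }
    }
    // expected: doc1=5, doc2=5, doc3=7
    System.out.println(docLen);
  }
}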
410TextInformationSystems/MPs/cs410/assign4/src/IndexGeneration.java: 122 additions, 0 deletions
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/// This application takes the raw inverted index generated by InvertedIndex
/// and generates two index files: (1) a term lexicon with basic term statistics and
/// pointers into the posting file, and (2) the postings.
/// All the files are HDFS files, so they can potentially support indexing large collections.
///
/// The program goes through the original raw inverted index generated by InvertedIndex sequentially,
/// rewrites the postings to a new file, and records the starting point of the entries for each term
/// in the term lexicon.
///
/// Note that the generated posting file is actually very similar to the original inverted index file,
/// so we could have kept the original inverted index file as the posting file, but in a polished
/// inverted index we would represent everything as integers and compress them, so this extra
/// step is conceptually necessary.
///
/// Usage: hadoop jar simir.jar IndexGeneration Path-to-rawPosting IndexFileName
/// "Path-to-rawPosting" points to a raw inverted index/posting file generated by InvertedIndex
/// (the current implementation can only take one file).
/// "IndexFileName" is the name (including the path) for the inverted index to be created.
/// Two files will be generated: "IndexFileName.lex" for the lexicon and "IndexFileName.pos" for the postings.

public class IndexGeneration {

  public static void main (String[] args) throws IOException {

    /// the following is basic setup needed to access HDFS files
    Configuration conf = new Configuration();
    conf.addResource(new Path("/hadoop/conf/hadoop-default.xml"));
    conf.addResource(new Path("/hadoop/conf/hadoop-site.xml"));
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream fin;
    FSDataOutputStream foutposting, foutlexicon;

    try {
      fin = fs.open(new Path(args[0]));                      // args[0] has the path to the raw inverted index
      foutposting = fs.create(new Path(args[1] + ".pos"));   // posting file with name in args[1]
      foutlexicon = fs.create(new Path(args[1] + ".lex"));   // term lexicon with name in args[1]
      char c;
      String t = null;
      int progress = 0;
      int freq;
      BufferedReader reader = new BufferedReader(new InputStreamReader(fin));
      while ((t = reader.readLine()) != null) {
        // each line corresponds to all the entries for a different term:
        // it starts with the term itself, followed by a sequence of (docID, termFreq) pairs
        // representing the documents containing the term as well as the corresponding term counts.
        StringTokenizer st = new StringTokenizer(t);
        foutlexicon.writeUTF(st.nextToken()); // fetch the first string (which is the term) and
                                              // write it to the lexicon

        int df = 0;
        int count = 0;
        long pos = foutposting.getPos();
        // remember the current position in the new posting file
        // so that we can easily calculate the span of the postings for this term later

        while (st.hasMoreTokens()) {
          // iterate over all the (docID, count) pairs and copy them to foutposting.
          // first, copy the docID using foutposting.writeUTF.

          foutposting.writeUTF(st.nextToken());

          if (st.hasMoreTokens()) {
            // we should expect another token for the term frequency/count
            freq = Integer.parseInt(st.nextToken().trim());
            //#########################################################//
            // add a statement here so that at the end of the loop "count" holds the total
            // count of the term over all the documents
            // Hint: how to update "count"?
            //#########################################################//
            count = count + freq;
            foutposting.writeInt(freq); // copy the term frequency/count to foutposting
          } else {
            System.err.println("Term frequency is expected");
          }

          //#########################################################//
          // add a statement here to use "df" to count how many documents contain the term
          // Hint: how to update "df"?
          //#########################################################//
          df = df + 1;
        }
        int len = new Long(foutposting.getPos() - pos).intValue(); // this tells us the span of the posting entries for this term

        // the following four statements write out df, count, pos, and len to foutlexicon
        // (recall that the term was already written to foutlexicon).
        foutlexicon.writeInt(df);
        foutlexicon.writeInt(count);
        foutlexicon.writeLong(pos);
        foutlexicon.writeInt(len);
        progress++;
        if (progress % 5000 == 0) {
          System.out.println(progress + " terms processed");
        }
      }
      foutlexicon.close();
      foutposting.close();
    } catch (IOException ioe) {
      System.out.println("can't open file " + args[0] + " or can't create the term index lexicon: " + args[1]);
      System.exit(1);
    }

  }

}
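For reference, here is a minimal sketch (not part of the commit) of how the lexicon records written above could be read back. It assumes exactly the write order used in IndexGeneration (writeUTF(term), writeInt(df), writeInt(count), writeLong(pos), writeInt(len)); the LexiconDump class name and the local "index.lex" file name are hypothetical.

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

// Hypothetical reader for a local copy of the lexicon produced by IndexGeneration.
// Each record mirrors the write order above: UTF term, int df, int count, long pos, int len.
public class LexiconDump {
  public static void main(String[] args) throws IOException {
    String lexFile = args.length > 0 ? args[0] : "index.lex"; // assumed local copy of IndexFileName.lex
    DataInputStream in = new DataInputStream(new FileInputStream(lexFile));
    try {
      while (true) {
        String term = in.readUTF();   // term written with writeUTF
        int df = in.readInt();        // number of documents containing the term
        int count = in.readInt();     // total occurrences of the term in the collection
        long pos = in.readLong();     // byte offset of this term's postings in the .pos file
        int len = in.readInt();       // byte length of this term's posting entries
        System.out.println(term + "\tdf=" + df + "\tcount=" + count + "\tpos=" + pos + "\tlen=" + len);
      }
    } catch (EOFException eof) {
      // reached the end of the lexicon
    } finally {
      in.close();
    }
  }
}

The pos/len pair recorded per term is what lets a retrieval component seek directly into the .pos file and read only that term's postings, which is the point of splitting the index into a lexicon and a posting file.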