
Commit

upload 410
Hao Wang committed May 17, 2017
1 parent 7ccd2fd commit a3ea9c6
Showing 321 changed files with 317,339 additions and 0 deletions.
3,720 changes: 3,720 additions & 0 deletions 410TextInformationSystems/MPs/cs410/assign4/exp/qrel

Large diffs are not rendered by default.

156 changes: 156 additions & 0 deletions 410TextInformationSystems/MPs/cs410/assign4/src/ComputeDocLen.java
@@ -0,0 +1,156 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import java.util.HashMap;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
* This is a Hadoop Map/Reduce application that computes document lengths based on
* the "raw inverted index" (i.e., output generated by "InvertedIndex").
*
* To run: hadoop jar simir.jar ComputeDocLen
* [-m <i>maps</i>] [-r <i>reduces</i>] <i>in-dir</i> <i>out-dir</i>
* "in-dir" has all the raw inverted index files generated by "InvertedIndex"
* "out-dir" is the directory to put the document length table.
*/
public class ComputeDocLen extends Configured implements Tool {

/**
*
* For each line of input, skip the first string (the term), then read each pair (docID, termCount),
* and emit (<b>docID</b>, <b>termCount</b>).
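* Example (hypothetical data): the raw index line "apple d1 3 d7 2" produces the pairs (d1, 3) and (d7, 2).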
*/
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {

private Text word = new Text();
private Text did = new Text();

public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line);
String docID ="";
String term ="";

if (itr.hasMoreTokens()) {
term=itr.nextToken();
while (itr.hasMoreTokens()) {
docID = itr.nextToken();
did.set(docID);
word.set(itr.nextToken());
output.collect(did,word);
}
}
}
}

/**
* A reducer class that sums the per-term counts for each document and emits the total, i.e., the document length.
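* Example (hypothetical data): for key d1 with values [3, 2, 5], the reducer emits (d1, 10), the length of document d1.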
*/
public static class Reduce extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {

public void reduce(Text key, Iterator<Text> values,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
int count = 0;
while (values.hasNext()) {
count = count + Integer.parseInt(values.next().toString().trim()) ;
}
Text t = new Text();
t.set(Integer.toString(count));
output.collect(key,t);
}
}

static int printUsage() {
System.out.println("ComputeDocLen [-m <maps>] [-r <reduces>] <input> <output>");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}

/**
* The main driver for ComputeDocLen map/reduce program.
* Invoke this method to submit the map/reduce job.
* @throws IOException When there are communication problems with the
* job tracker.
*/
public int run(String[] args) throws Exception {
JobConf conf = new JobConf(getConf(), ComputeDocLen.class);
conf.setJobName("computerdoclength");

// the keys are document IDs (strings)
conf.setOutputKeyClass(Text.class);
// the values are strings too
conf.setOutputValueClass(Text.class);

conf.setMapperClass(MapClass.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);

List<String> other_args = new ArrayList<String>();
for(int i=0; i < args.length; ++i) {
try {
if ("-m".equals(args[i])) {
conf.setNumMapTasks(Integer.parseInt(args[++i]));
} else if ("-r".equals(args[i])) {
conf.setNumReduceTasks(Integer.parseInt(args[++i]));
} else {
other_args.add(args[i]);
}
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
return printUsage();
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " +
args[i-1]);
return printUsage();
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: " +
other_args.size() + " instead of 2.");
return printUsage();
}
FileInputFormat.setInputPaths(conf, other_args.get(0));
FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

JobClient.runJob(conf);
return 0;
}


public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new ComputeDocLen(), args);
System.exit(res);
}

}
122 changes: 122 additions & 0 deletions 410TextInformationSystems/MPs/cs410/assign4/src/IndexGeneration.java
@@ -0,0 +1,122 @@
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;


/// This application takes the raw inverted index generated by InvertedIndex
/// and generates two index files: (1) a term lexicon with basic term statistics and
/// pointers into the postings file, and (2) the postings file itself.
/// All the files are HDFS files, so they can potentially support indexing large collections
///
/// The program goes through the original raw inverted index generated by InvertedIndex sequentially,
/// rewrites the postings to a new file, and records the starting point of each term's entries
/// in the term lexicon.
///
/// Note that the generated posting file is actually very similar to the original inverted index file,
/// so we could have kept the original inverted index file as the posting file, but in a polished
/// inverted index, we would represent everything as integers and compress them, so this extra
/// step is conceptually necessary.
///
/// usage: hadoop jar simir.jar IndexGeneration Path-to-rawPosting IndexFileName
/// "path-to-rawposting" points to a raw inverted index/posting file generated by InvertedIndex
/// (the current implementation can only take one file)
/// "IndexFileName" is the name (including the path) for the inverted index to be created.
/// Two files will be generated: "IndexFileName.lex" for the lexicon and "IndexFileName.pos" for postings.
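/// Example (hypothetical paths): hadoop jar simir.jar IndexGeneration /index/raw/part-00000 /index/simir
/// which produces /index/simir.lex and /index/simir.pos.
/// Based on the write calls below, each lexicon record is: term (UTF), df (int), count (int), pos (long), len (int);
/// each posting entry is: docID (UTF), termFreq (int).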

public class IndexGeneration {

public static void main (String [] args) throws IOException {

/// the following is basic setup needed to access HDFS files
Configuration conf = new Configuration();
conf.addResource(new Path("/hadoop/conf/hadoop-default.xml"));
conf.addResource(new Path("/hadoop/conf/hadoop-site.xml"));
FileSystem fs = FileSystem.get(conf);
FSDataInputStream fin;
FSDataOutputStream foutposting, foutlexicon;

try {
fin = fs.open(new Path(args[0] )); // args[0] has the path to the raw inverted index
foutposting = fs.create(new Path(args[1] + ".pos" )); // posting file with name in args[1]
foutlexicon = fs.create(new Path(args[1] + ".lex")); // term lexicon with name in args[1]
String t=null;
int progress=0;
int freq;
BufferedReader reader = new BufferedReader(new InputStreamReader(fin));
while ((t=reader.readLine()) != null) {
// each line corresponds to all the entries for a different term
// it starts with the term itself, followed by a sequence of (docID, termFreq) pairs
// representing the documents containing the term as well as the corresponding term counts.
StringTokenizer st = new StringTokenizer(t);
foutlexicon.writeUTF(st.nextToken()); // fetch the first string (which is the term) and
// write it to the lexicon

int df=0;
int count=0;
long pos = foutposting.getPos();
// remember the current position in the new posting file
// so that we can easily calculate the span of the postings for this term later
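// together, "pos" (start offset) and the later "len" (byte span) let a reader seek directly to this term's postings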

while (st.hasMoreTokens()) {
// iterate over all the (docID count) pairs and copy them to foutposting.
// first, copy the docID using foutposting.writeUTF.

foutposting.writeUTF(st.nextToken());

if (st.hasMoreTokens()) {
// we should expect another token for the term frequency/count
freq = Integer.parseInt(st.nextToken().trim());
//#########################################################//
// add a statement here so that at the end of the loop "count" holds the total
// count of the term over all the documents
// Hint: how to update "count"?
//
//#########################################################//
count = count + freq;
foutposting.writeInt(freq); // copy the term frequency/count to foutposting
} else {
System.err.println("Term frequency is expected");
}

//#########################################################//
// add a statement here to use "df" to count how many documents contain the term
// Hint: how to update "df"?
//
//#########################################################//
df = df + 1;
}
int len = (int) (foutposting.getPos() - pos); // this tells us the byte span of the posting entries for this term

// the following four statements write out df, count, pos, and len to foutlexicon
// recall that the term was already written to foutlexicon.
foutlexicon.writeInt(df);
foutlexicon.writeInt(count);
foutlexicon.writeLong(pos);
foutlexicon.writeInt(len);
progress++;
if (progress % 5000 ==0) {
System.out.println(progress + " terms processed");
}
}
foutlexicon.close();
foutposting.close();
} catch (IOException ioe) {
System.out.println("can't open file "+args[0] + " or can't create the term index lexicon:" + args[1]);
System.exit(1);
}


}

}
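
For reference, a minimal sketch of how the generated .lex and .pos files could be read back, based only on the write calls above; the class name ReadLexicon and the command-line argument are hypothetical, and error handling is kept minimal.

import java.io.EOFException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reader sketch: walks the lexicon and, for each term, seeks to its postings.
public class ReadLexicon {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // args[0] is the index name passed to IndexGeneration (i.e., "IndexFileName")
        FSDataInputStream lex = fs.open(new Path(args[0] + ".lex"));
        FSDataInputStream pos = fs.open(new Path(args[0] + ".pos"));
        try {
            while (true) {
                String term = lex.readUTF();  // written by foutlexicon.writeUTF(term)
                int df = lex.readInt();       // number of documents containing the term
                int count = lex.readInt();    // total occurrences of the term in the collection
                long offset = lex.readLong(); // start of this term's postings in the .pos file
                int len = lex.readInt();      // byte span of those postings (read to stay aligned)
                pos.seek(offset);
                System.out.print(term + " df=" + df + " count=" + count + " len=" + len + " :");
                for (int i = 0; i < df; i++) {
                    String docID = pos.readUTF(); // written by foutposting.writeUTF(docID)
                    int freq = pos.readInt();     // written by foutposting.writeInt(freq)
                    System.out.print(" (" + docID + "," + freq + ")");
                }
                System.out.println();
            }
        } catch (EOFException eof) {
            // reached the end of the lexicon
        } finally {
            lex.close();
            pos.close();
        }
    }
}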


