Commit a3ea9c6
Hao Wang committed May 17, 2017 (1 parent: 7ccd2fd)
Showing 321 changed files with 317,339 additions and 0 deletions.
410TextInformationSystems/MPs/cs410/assign4/exp/qrel: 3,720 additions, 0 deletions (large diff not rendered)
BIN +2.06 KB  410TextInformationSystems/MPs/cs410/assign4/obj/ComputeDocLen$MapClass.class (binary file added, not shown)
BIN +1.8 KB   410TextInformationSystems/MPs/cs410/assign4/obj/ComputeDocLen$Reduce.class (binary file added, not shown)
BIN +2.83 KB  410TextInformationSystems/MPs/cs410/assign4/obj/InvertedIndex$MapClass.class (binary file added, not shown)
BIN +1.85 KB  410TextInformationSystems/MPs/cs410/assign4/obj/InvertedIndex$Reduce.class (binary file added, not shown)
410TextInformationSystems/MPs/cs410/assign4/src/ComputeDocLen.java: 156 additions, 0 deletions
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * This is a Hadoop Map/Reduce application that computes document lengths based on
 * the "raw inverted index" (i.e., the output generated by "InvertedIndex").
 *
 * To run: hadoop jar simir.jar ComputeDocLen
 *           [-m <i>maps</i>] [-r <i>reduces</i>] <i>in-dir</i> <i>out-dir</i>
 * "in-dir" has all the raw inverted index files generated by "InvertedIndex"
 * "out-dir" is the directory to put the document length table.
 */
public class ComputeDocLen extends Configured implements Tool {

  /**
   * For each line of input, skip the first string (the term), then read each
   * (docID, termCount) pair and emit (<b>docID</b>, <b>termCount</b>).
   */
  public static class MapClass extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, Text> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private Text did = new Text();

    public void map(LongWritable key, Text value,
                    OutputCollector<Text, Text> output,
                    Reporter reporter) throws IOException {
      String line = value.toString();
      StringTokenizer itr = new StringTokenizer(line);
      String docID = "";
      String term = "";

      if (itr.hasMoreTokens()) {
        term = itr.nextToken();
        while (itr.hasMoreTokens()) {
          docID = itr.nextToken();
          did.set(docID);
          word.set(itr.nextToken());
          output.collect(did, word);
        }
      }
    }
  }

  /**
   * A reducer class that just emits the sum of the input values.
   */
  public static class Reduce extends MapReduceBase
    implements Reducer<Text, Text, Text, Text> {

    Text s = new Text();

    public void reduce(Text key, Iterator<Text> values,
                       OutputCollector<Text, Text> output,
                       Reporter reporter) throws IOException {
      String sum = "";
      int count = 0;
      while (values.hasNext()) {
        count = count + Integer.parseInt(values.next().toString().trim());
      }
      Text t = new Text();
      t.set(Integer.toString(count));
      output.collect(key, t);
    }
  }

  static int printUsage() {
    System.out.println("ComputeDocLen [-m <maps>] [-r <reduces>] <input> <output>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * The main driver for the ComputeDocLen map/reduce program.
   * Invoke this method to submit the map/reduce job.
   * @throws IOException When there are communication problems with the
   *                     job tracker.
   */
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ComputeDocLen.class);
    conf.setJobName("computedoclength");

    // the keys are docIDs (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are strings too
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " +
                           args[i - 1]);
        return printUsage();
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println("ERROR: Wrong number of parameters: " +
                         other_args.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new ComputeDocLen(), args);
    System.exit(res);
  }

}
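To make the data flow concrete, the following is a minimal, self-contained sketch (not part of the commit) of what MapClass and Reduce compute together: for each raw-inverted-index line, the term is skipped, each (docID, termCount) pair is emitted, and the counts are summed per document to give its length. The sample lines, document IDs, and the DocLenDemo class name are made up for illustration.

import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

// Hypothetical illustration of ComputeDocLen's logic on a couple of raw inverted index lines.
// Assumed input format (per MapClass): "term docID1 count1 docID2 count2 ..."
public class DocLenDemo {
  public static void main(String[] args) {
    // two made-up index lines, one per term
    String[] rawIndexLines = {
      "apple doc1 3 doc2 5",
      "banana doc1 2 doc3 7"
    };
    // the reducer's role: sum, per docID, all term counts emitted by the mapper
    Map<String, Integer> docLen = new HashMap<String, Integer>();
    for (String line : rawIndexLines) {
      StringTokenizer itr = new StringTokenizer(line);
      itr.nextToken();                          // skip the term, as MapClass does
      while (itr.hasMoreTokens()) {
        String docID = itr.nextToken();         // (docID, termCount) pair
        int termCount = Integer.parseInt(itr.nextToken());
        Integer old = docLen.get(docID);
        docLen.put(docID, (old == null ? 0 : old) + termCount);
      }
    }
    // expected: doc1=5, doc2=5, doc3=7
    System.out.println(docLen);
  }
}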
410TextInformationSystems/MPs/cs410/assign4/src/IndexGeneration.java: 122 additions, 0 deletions
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/// This application takes the raw inverted index generated by InvertedIndex
/// and generates two index files: (1) a term lexicon with basic term statistics and
/// pointers into the posting file, and (2) the postings.
/// All the files are HDFS files, so they can potentially support indexing large collections.
///
/// The program goes through the original raw inverted index generated by InvertedIndex sequentially,
/// rewrites the postings to a new file, and records the starting point of the entries for each term
/// in the term lexicon.
///
/// Note that the generated posting file is actually very similar to the original inverted index file,
/// so we could have kept the original inverted index file as the posting file, but in a polished
/// inverted index we would represent everything as integers and compress them, so this extra
/// step is conceptually necessary.
///
/// Usage: hadoop jar simir.jar IndexGeneration Path-to-rawPosting IndexFileName
/// "Path-to-rawPosting" points to a raw inverted index/posting file generated by InvertedIndex
/// (the current implementation can only take one file).
/// "IndexFileName" is the name (including the path) for the inverted index to be created.
/// Two files will be generated: "IndexFileName.lex" for the lexicon and "IndexFileName.pos" for the postings.

public class IndexGeneration {

  public static void main (String[] args) throws IOException {

    /// the following is basic setup needed to access HDFS files
    Configuration conf = new Configuration();
    conf.addResource(new Path("/hadoop/conf/hadoop-default.xml"));
    conf.addResource(new Path("/hadoop/conf/hadoop-site.xml"));
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream fin;
    FSDataOutputStream foutposting, foutlexicon;

    try {
      fin = fs.open(new Path(args[0]));                      // args[0] has the path to the raw inverted index
      foutposting = fs.create(new Path(args[1] + ".pos"));   // posting file with name in args[1]
      foutlexicon = fs.create(new Path(args[1] + ".lex"));   // term lexicon with name in args[1]
      char c;
      String t = null;
      int progress = 0;
      int freq;
      BufferedReader reader = new BufferedReader(new InputStreamReader(fin));
      while ((t = reader.readLine()) != null) {
        // each line corresponds to all the entries for a different term:
        // it starts with the term itself, followed by a sequence of (docID, termFreq) pairs
        // representing the documents containing the term as well as the corresponding term counts.
        StringTokenizer st = new StringTokenizer(t);
        foutlexicon.writeUTF(st.nextToken()); // fetch the first string (which is the term) and
                                              // write it to the lexicon

        int df = 0;
        int count = 0;
        long pos = foutposting.getPos();
        // remember the current position in the new posting file
        // so that we can easily calculate the span of the postings for this term later

        while (st.hasMoreTokens()) {
          // iterate over all the (docID, count) pairs and copy them to foutposting.
          // first, copy the docID using foutposting.writeUTF.

          foutposting.writeUTF(st.nextToken());

          if (st.hasMoreTokens()) {
            // we should expect another token for the term frequency/count
            freq = Integer.parseInt(st.nextToken().trim());
            //#########################################################//
            // add a statement here so that at the end of the loop "count" holds the total
            // count of the term over all the documents
            // Hint: how to update "count"?
            //#########################################################//
            count = count + freq;
            foutposting.writeInt(freq); // copy the term frequency/count to foutposting
          } else {
            System.err.println("Term frequency is expected");
          }

          //#########################################################//
          // add a statement here to use "df" to count how many documents contain the term
          // Hint: how to update "df"?
          //#########################################################//
          df = df + 1;
        }
        int len = new Long(foutposting.getPos() - pos).intValue(); // this tells us the span of the posting entries for this term

        // the following four statements write out df, count, pos, and len to foutlexicon
        // (recall that the term was already written to foutlexicon).
        foutlexicon.writeInt(df);
        foutlexicon.writeInt(count);
        foutlexicon.writeLong(pos);
        foutlexicon.writeInt(len);
        progress++;
        if (progress % 5000 == 0) {
          System.out.println(progress + " terms processed");
        }
      }
      foutlexicon.close();
      foutposting.close();
    } catch (IOException ioe) {
      System.out.println("can't open file " + args[0] + " or can't create the term index lexicon: " + args[1]);
      System.exit(1);
    }

  }

}
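For reference, here is a minimal sketch (not part of the commit) of how the lexicon records written above could be read back. It assumes exactly the write order used in IndexGeneration (writeUTF(term), writeInt(df), writeInt(count), writeLong(pos), writeInt(len)); the LexiconDump class name and the local "index.lex" file name are hypothetical.

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

// Hypothetical reader for a local copy of the lexicon produced by IndexGeneration.
// Each record mirrors the write order above: UTF term, int df, int count, long pos, int len.
public class LexiconDump {
  public static void main(String[] args) throws IOException {
    String lexFile = args.length > 0 ? args[0] : "index.lex"; // assumed local copy of IndexFileName.lex
    DataInputStream in = new DataInputStream(new FileInputStream(lexFile));
    try {
      while (true) {
        String term = in.readUTF();   // term written with writeUTF
        int df = in.readInt();        // number of documents containing the term
        int count = in.readInt();     // total occurrences of the term in the collection
        long pos = in.readLong();     // byte offset of this term's postings in the .pos file
        int len = in.readInt();       // byte length of this term's posting entries
        System.out.println(term + "\tdf=" + df + "\tcount=" + count + "\tpos=" + pos + "\tlen=" + len);
      }
    } catch (EOFException eof) {
      // reached the end of the lexicon
    } finally {
      in.close();
    }
  }
}

The pos/len pair recorded per term is what lets a retrieval component seek directly into the .pos file and read only that term's postings, which is the point of splitting the index into a lexicon and a posting file.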