山东大学大数据实验二倒排索引算法Java实现

大小: 6KB

文件类型: .java

金币: 1

下载: 0 次

发布日期: 2021-06-14
语言: Java
标签: 倒排索引 大数据 Java hadoop eclipse

高速下载

资源简介

山东大学大数据课程的实验二。基于Hadoop集群系统（也可以在伪分布式系统上运行），使用Java编写的倒排索引实现，支持停用词表功能，并使用正则表达式筛选规范的单词。代码重写了setup()、map()、combiner()、partition()和reducer()函数，功能是对文档建立倒排索引，得到一个单词有序、且每个单词的文件列表同样有序的倒排列表集合。

资源截图

小图大图

代码片段和文件信息

package com.Test4;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class InvertedIndex {
	public static class myMap extends Mapperject Text Text IntWritable>{
		private final static IntWritable one = new IntWritable（1）;
		private URI[] remoteFiles; // 存放停用词txt文档统一资源标识符 
		private Set stopwords; //存放停用词 
		public void setup（Context context） throws IOException InterruptedException{
			Configuration conf = context.getConfiguration（）;
			 remoteFiles = Job.getInstance（conf）.getCacheFiles（）;//获取stop_words.txt 
			 stopwords = new TreeSet（）; //对于URI列表里的每一个停用词表 
			 for （int i = 0; i < remoteFiles.length; i++） { 
				 FileInputStream in =new FileInputStream（new Path（remoteFiles[i].getPath（））.getName（）.toString（））;
				 //读取文件的每一行 
				 Scanner sc =new Scanner（in）; 
				 while （sc.hasNextLine（）） {
					 String line = sc.nextLine（）;
					 String[] split = line.trim（）.split（“ “）; //trim（）:去掉空格 制表符等，split（“ “）:依据文件格式可用可不用‘
					 for （int j = 0; j < split.length; j++） { 
						 stopwords.add（split[j]）; 
					 }
				}
				sc.close（）;
			 }
    	}
		
		public void map（object key Text value Context context） throws IOException InterruptedException {
			FileSplit inputSplit = （FileSplit） context.getInputSplit（）; 
			String filename=inputSplit.getPath（）.getName（）; //获取文件名 
			//正则表达式去除特殊字符
			String str = “\\w+“;//字母或数字或下划线或汉字
			Pattern pattern = Pattern.compile（str）;
			String line = value.toString（）.toLowerCase（）;
			StringTokenizer itr = new StringTokenizer（line）;
			String temp = new String（）;
			for（; itr.hasMoreTokens（）;） {
				temp = itr.nextToken（）;
				Matcher ma = pattern.matcher（temp）;
				while（ma.find（）） {
					String word = ma.group（）;
					if（!stopwords.contains（word）） {
						Text text = new Text（）;
						text.set（word+“#“+filename）; //key = word+#+filename
						context.write（text one）;
					}
				}
			}
		}
	}
	public static class myCombiner extends Reducer{
		public void reduce（Text key Iterable values Context context） throws IOException InterruptedException {
			int sum = 0;
			for （IntWritable val : values） {

上一篇：阿里支付jar包
下一篇：成绩的查询和排序功能实现 java实现

共有条评论

山东大学 大数据实验二 倒排索引算法Java实现

资源简介

资源截图

代码片段和文件信息

评论

相关资源

山东大学大数据实验二倒排索引算法Java实现