hadoop并行化和非并行化的kmeans算法.zip

大小: 5KB

文件类型: .zip

金币: 2

下载: 2 次

发布日期: 2021-12-17
语言: 其他
标签: kmeans hadoop java map/reduce 山东大学

高速下载

资源简介

包含两种平台上运行的kmeans算法：一种是在Hadoop系统上的并行化kmeans算法，支持读文件，执行聚类算法，输出质心文件，将每个数据的聚类信息输出到控制台上；另一种是串行的聚类算法，支持读文件数据，执行kmeans算法，将每个数据的聚类信息输出到文件中。代码注释清晰。

资源截图

小图 / 大图

代码片段和文件信息

package com.kmeans;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.LineReader;

class Center{
	protected static int k = 3;		//质心的个数
	protected static int dimension = 2; //数据的维度
	
	//从初始的质心文件中加载质心，并返回质心文件字符串，质心之间用tab分割
	public String loadInitCenter（Path path） throws IOException {
		StringBuffer sb = new StringBuffer（）;
		Configuration conf = new Configuration（）;
		FileSystem hdfs = FileSystem.get（conf）;
		FSDataInputStream dis = hdfs.open（path）;
		LineReader in = new LineReader（dis conf）;
		Text line = new Text（）;
		while（in.readLine（line） > 0） {
			sb.append（line.toString（）.trim（））;//trim（）:去掉字符串两端多余的空格
			sb.append（“\t“）;
		}
		return sb.toString（）.trim（）;
	}
	//从每次迭代的质心文件里读取质心，并返回字符串
	public String loadCenter（Path path） throws IOException {
		StringBuffer sb = new StringBuffer（）;
		Configuration conf = new Configuration（）;
		FileSystem hdfs = FileSystem.get（conf）;
		//获取文件列表
		FileStatus[] files = hdfs.listStatus（path）;
		for（int i = 0; i < files.length; i++） {
			Path filePath = files[i].getPath（）;
			if（!filePath.getName（）.contains（“part“）） continue;
			FSDataInputStream dis = hdfs.open（filePath）;
			LineReader in = new LineReader（dis conf）;
			Text line = new Text（）;
			while（in.readLine（line） > 0） {
				sb.append（line.toString（）.trim（））;
				sb.append（“\t“）;
			}
		}
		return sb.toString（）.trim（）;
	}
}

public class Kmeans {
	private static String FLAG = “a“;//用于存聚类中心信息
	
	//计算两个向量之间的 欧式距离
	public static double distance（double[] a double[] b） {
		if（a == null || b == null || a.length != b.length） return Double.MAX_VALUE;
		double d = 0;
		for（int i = 0; i < a.length; i++） {
			d += Math.pow（a[i] - b[i] 2）;
		}
		return Math.sqrt（d）;
	}
	
	public static class mapper extends Mapperject Text Text Text>{
		
		double[][] centers = new double[Center.k][];//存储每个簇中心的信息
		String[] centerstrArray = null;//用于存储聚类中心的字符串连接信息
		
		public void setup（Context context） {
			//将放在context中的聚类中心转换为数组的形式，方便使用
			String kmeansS = context.getConfiguration（）.get（FLAG）;
			centerstrArray = kmeansS.split（“\t“）;
			
			for（int i = 0; i < centerstrArray.length; i++） {
				String[] segs = centerstrArray[i].split（““）;
				centers[i] = new double[segs

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2019-05-27 02:44  hadoop并行化和非并行化的kmeans算法\
     目录           0  2019-05-27 02:44  hadoop并行化和非并行化的kmeans算法\并行化kmeans算法\
     文件        7808  2019-05-26 11:49  hadoop并行化和非并行化的kmeans算法\并行化kmeans算法\Kmeans.java
     目录           0  2019-05-27 02:44  hadoop并行化和非并行化的kmeans算法\非并行化kmeans算法\
     文件        4515  2019-05-26 12:27  hadoop并行化和非并行化的kmeans算法\非并行化kmeans算法\Kmeans.java

上一篇：opencv_3rdparty中所有ffmpeg库
下一篇：职工工资管理系统

共有条评论

hadoop并行化和非并行化的kmeans算法.zip

资源简介

资源截图

代码片段和文件信息

评论

相关资源