资源简介

Java项目:基于Hadoop、使用MapReduce框架对网站日志数据进行分析,并包含150M的网站日志数据

资源截图

代码片段和文件信息

package com.zzhao;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogClean {

private static String FOLDER_INPUT = “Resource/input“;
private static String FOLDER_OUTPUT = “Resource/output“;
private static long PV = 0;
private static long register = 0;
private static long IP = 0;
private static long jumper = 0;

public static void main(String[] args) throws IOException ClassNotFoundException InterruptedException {
Configuration cfg = new Configuration();

Job job = Job.getInstance(cfg);
job.setMapperClass(LogCleanMapper.class);
job.setReducerClass(LogCleanReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job new Path(FOLDER_INPUT));

Path outputDir = new Path(FOLDER_OUTPUT);
FileSystem fs = FileSystem.get(cfg);
if (fs.exists(outputDir)) {
fs.delete(outputDir true);
}

FileOutputFormat.setOutputPath(job outputDir);

boolean flag = job.waitForCompletion(true);
if (flag) {
System.out.println(“\tClean process success!\n\n\n“);
System.out.println(“\tPV量:\t\t“ + PV);
System.out.println(“\t注册用户数:\t“ + register);
System.out.println(“\t独立IP数:\t“ + IP);
System.out.println(“\t跳出用户数:\t“ + jumper);
} else {
System.out.println(“Clean process failed!“);
}
}

static class LogCleanMapper extends Mapper {
LogParser logParser = new LogParser();
Text k = new Text();
Text v = new Text();

protected void map(LongWritable key Text value
org.apache.hadoop.mapreduce.Mapper.Context context)
throws java.io.IOException InterruptedException {

PV += 1;

final String[] parsed = logParser.parse(value.toString());

// step1.过滤掉静态资源访问请求
if (parsed[2].startsWith(“GET /static/“) || parsed[2].startsWith(“GET /uc_server“)) {
return;
}

// step2.过滤掉开头的指定字符串
if (parsed[2].startsWith(“GET /“)) {
parsed[2] = parsed[2].substring(“GET /“.length());
} else if (parsed[2].startsWith(“POST /“)) {
parsed[2] = parsed[2].substring(“POST /“.length());
}

// step3.过滤掉结尾的特定字符串
if (parsed[2].endsWith(“ HTTP/1.1“)) {
parsed[2] = parsed[2].substring(0 parsed[2].length() - “ HTTP/1.1“.length());
}

// step4.只写入前三个记录类型项
k.set(parsed[0]);
v.set(parsed[1] + “\t\t“ + parsed[2]);
context.write(k v);

// 判断是否新用户
if (p

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件       6132  2019-06-04 11:07  Hadoop\.classpath

     文件        382  2019-06-04 10:27  Hadoop\.project

     文件         57  2019-06-04 14:15  Hadoop\.settings\org.eclipse.core.resources.prefs

     文件       3118  2019-06-11 15:54  Hadoop\bin\com\zzhao\LogClean$LogCleanMapper.class

     文件       2275  2019-06-11 15:54  Hadoop\bin\com\zzhao\LogClean$LogCleanReducer.class

     文件       3499  2019-06-11 15:54  Hadoop\bin\com\zzhao\LogClean.class

     文件       2529  2019-06-11 15:53  Hadoop\bin\com\zzhao\LogParser.class

     文件   61084192  2019-06-04 13:56  Hadoop\bin\input\access_2013_05_30.log

     文件  157069653  2019-06-04 14:32  Hadoop\bin\input\access_2013_05_31.log

     文件        879  2019-06-04 10:49  Hadoop\bin\log4j.properties

     文件        567  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$CachedUid.class

     文件        644  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$CachedName.class

     文件       1545  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$CacheManipulator.class

     文件       1302  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$IdCache.class

     文件       1592  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$NoMlockCacheManipulator.class

     文件       2575  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX$Stat.class

     文件       9842  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$POSIX.class

     文件       1595  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$Windows$AccessRight.class

     文件       2316  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO$Windows.class

     文件       9297  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\io\nativeio\NativeIO.class

     文件      30453  2019-06-11 15:53  Hadoop\bin\org\apache\hadoop\mapred\YARNRunner.class

     文件     428480  2019-06-04 15:48  Hadoop\bin\output\.part-r-00000.crc

     文件          8  2019-06-04 15:48  Hadoop\bin\output\._SUCCESS.crc

     文件   54844160  2019-06-04 15:48  Hadoop\bin\output\part-r-00000

     文件          0  2019-06-04 15:48  Hadoop\bin\output\_SUCCESS

     文件      62983  2016-01-14 08:45  Hadoop\lib\activation-1.1.jar

     文件       4467  2016-01-14 08:45  Hadoop\lib\aopalliance-1.0.jar

     文件      44925  2016-01-14 08:45  Hadoop\lib\apacheds-i18n-2.0.0-M15.jar

     文件     691479  2016-01-14 08:45  Hadoop\lib\apacheds-kerberos-codec-2.0.0-M15.jar

     文件      16560  2016-01-14 08:45  Hadoop\lib\api-asn1-api-1.0.0-M20.jar

............此处省略124个文件信息

评论

共有 条评论