• 大小: 3.9MB
    文件类型: .rar
    金币: 1
    下载: 0 次
    发布日期: 2023-10-28
  • 语言: 其他
  • 标签: Hadoop  文本分类  TF  

资源简介

基于Hadoop的文本分类算法系统。本系统实现了分词处理和停用词处理(IK);使用朴素贝叶斯分类算法对文本进行训练和分类,在测试过程中使用词频特征选择作为特征词选择算法,分类准确率达到78%;另外还包含卡方特征选择算法(用于训练集特征选择)。

资源截图

代码片段和文件信息

package cn.edu.cqut.bean;

public class CategoryTotal implements Comparable{
private String word;
private String category;
private double times;
private double categoryFileTotal;
private double totalFile;

/**
 * @return the word
 */
public String getWord() {
return word;
}
/**
 * @param word the word to set
 */
public void setWord(String word) {
this.word = word;
}
/**
 * @return the category
 */
public String getCategory() {
return category;
}
/**
 * @param category the category to set
 */
public void setCategory(String category) {
this.category = category;
}
/**
 * @return the times
 */
public double getTimes() {
return times;
}
/**
 * @param times the times to set
 */
public void setTimes(double times) {
this.times = times;
}
/**
 * @return the categoryFileTotal
 */
public double getCategoryFileTotal() {
return categoryFileTotal;
}
/**
 * @param categoryFileTotal the categoryFileTotal to set
 */
public void setCategoryFileTotal(double categoryFileTotal) {
this.categoryFileTotal = categoryFileTotal;
}
/**
 * @return the totalFile
 */
public double getTotalFile() {
return totalFile;
}
/**
 * @param totalFile the totalFile to set
 */
public void setTotalFile(double totalFile) {
this.totalFile = totalFile;
}
public CategoryTotal() {
super();
}
public CategoryTotal(String word String category double times double categoryFileTotal double totalFile) {
super();
this.word = word;
this.category = category;
this.times = times;
this.categoryFileTotal = categoryFileTotal;
this.totalFile = totalFile;
}
public CategoryTotal(String category double categoryFileTotal double totalFile) {
super();
this.category = category;
this.categoryFileTotal = categoryFileTotal;
this.totalFile = totalFile;
}
@Override
public int compareTo(CategoryTotal o) {
int out = -2;
if(this.word.equals(o.getWord())){
out = 0;
}
if(this.category.equals(o.getCategory())){
out = 0;
}
if(this.times==o.getTimes()){
out = 0;
}
if(this.categoryFileTotal==o.getCategoryFileTotal()){
out = 0;
}
if(this.totalFile==o.getTotalFile()){
out = 0;
}
return out;
}
}

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件        794  2014-05-21 20:22  NativeBayes\.classpath

     文件        370  2014-05-21 19:38  NativeBayes\.project

     文件        587  2014-05-21 19:38  NativeBayes\.settings\org.eclipse.jdt.core.prefs

     文件       2346  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\bean\CategoryTotal.class

     文件       2396  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\FileTotal$FileCountMapper.class

     文件       3937  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\FileTotal$FileCountReducer.class

     文件        479  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\FileTotal.class

     文件       2412  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\ModelTrain$ModelTrainCombiner.class

     文件       3069  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\ModelTrain$ModelTrainMapper.class

     文件       5182  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\ModelTrain$ModelTrainReducer.class

     文件        574  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\ModelTrain.class

     文件       3349  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\NativeBayes$NativeBayesCombiner.class

     文件       3076  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\NativeBayes$NativeBayesMapper.class

     文件       1130  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\NativeBayes$NativeBayesPartitoner.class

     文件       5079  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\NativeBayes$NativeBayesReducer.class

     文件        679  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\mapreduce\NativeBayes.class

     文件       2845  2014-06-09 05:31  NativeBayes\bin\cn\edu\cqut\run\TestWork.class

     文件       2979  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\run\TrainWork.class

     文件       2476  2014-06-09 05:14  NativeBayes\bin\cn\edu\cqut\util\Curr.class

     文件       2638  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\util\GetAllFilePath.class

     文件       4578  2014-06-09 05:07  NativeBayes\bin\cn\edu\cqut\util\ReadFileFromHdfs.class

     文件        412  2014-06-09 04:47  NativeBayes\bin\IKAnalyzer.cfg.xml

     文件       8137  2014-06-08 00:14  NativeBayes\bin\stopword.dic

     文件      41123  2014-05-21 19:38  NativeBayes\lib\commons-cli-1.2.jar

     文件     279781  2014-05-21 19:38  NativeBayes\lib\commons-httpclient-3.0.1.jar

     文件      38015  2014-05-21 19:38  NativeBayes\lib\commons-logging-1.0.4.jar

     文件       6839  2014-05-21 19:38  NativeBayes\lib\hadoop-0.20.2-ant.jar

     文件    2689741  2014-05-21 19:38  NativeBayes\lib\hadoop-0.20.2-core.jar

     文件      69940  2014-05-21 19:38  NativeBayes\lib\hadoop-0.20.2-tools.jar

     文件    1165347  2014-05-21 19:38  NativeBayes\lib\IKAnalyzer2012_u6.jar

............此处省略33个文件信息

评论

共有 条评论