• 大小: 4.04MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-10-28
  • 语言: 其他
  • 标签: Tf  idf  词频算法  

资源简介

基于 TF-IDF 的文档集关键词提取。已包含测试文档集，可以替换成任意需要的文档集，也可以自己提供字典。

资源截图

代码片段和文件信息

///////////////////////////////////////////////////////////////////
// File          :Dir txt Input
// Author        :ShuanHolmes
// Date          :2015.4.10
// Modifier      :...
// Modify Date   :...
// Description   :statics_Dir.cpp
///////////////////////////////////////////////////////////////////
#include “Statics.h“ 

extern map< string int > Dic;
extern list< string > SinStatics;
extern multiset< string > SumStatics;
extern multiset< string > Fileidf;
extern set< Word > Database;
list< WordIDF > DataOut;

void getJustCurrentFile( string path vector& files)  
{    // return file iter    
long  hFile  =  0;    // file info   
struct _finddata_t fileinfo;    
string p;    
if((hFile = _findfirst(p.assign(path).append(“\\*“).c_str()&fileinfo)) != -1)    
{      
do     
{         
if((fileinfo.attrib & _A_SUBDIR));               
else               
files.push_back(fileinfo.name);             
}while(_findnext(hFile &fileinfo) == 0);      
_findclose(hFile);    
}  
}

void WordFrequency( void )
{
SinStatics.unique();
while(!SinStatics.empty())
{
SumStatics.insert(SinStatics.back( ));
SinStatics.pop_back( );
}
SinStatics.clear(); // register clear
}

void DatabaseConstruction( float N ) // the file group 
{
multiset< string >::iterator it;
Word temp;
for(it = SumStatics.begin(); it != SumStatics.end(); it++ )
{
temp.wordfrequency = fabs(log(N/(float)SumStatics.count(*it))/log(2));
temp.word = *it;
if(Database.find(temp)==Database.end())
{
Database.insert(temp);
}
}
SinStatics.clear();
SumStatics.clear();
Dic.clear();
}

void TfidfFileInput(char *filename) // segment the sentence  store the real words
{
ifstream testfile(filename);
string testsentence;
string testword;
if (!testfile)
cerr << “Fail to open “ << filename << endl;
else
cout << “Succeed to open “ << filename << endl;
cout << “Please wait “<< filename << “ segmenting the sentences in test file!“ << endl;
while(!testfile.eof())
{
getline(testfiletestsentence‘\n‘);
string result_temp=““;
int result_len = 0;
string sentence_temp=testsentence;
int cur_sen_length=testsentence.length();
int len1len2;
while(sentence_temp!=““)
{
len1 = sentence_temp.length();
len2 = sentence_temp.length();
if(len2 > MaxWordLength) // MaxLength 
len2 = MaxWordLength;
testword = sentence_temp.substr(len1-len2);
bool isw = TFidfWordCheck( testword );
while(len2 > 2 && isw == false)
{
len2 = len2-2; // 2 Byte 1 word
testword = sentence_temp.substr(len1-len2);
isw = TFidfWordCheck( testword );
}
if(result_temp == ““)
result_temp=testword+result_temp; // continue
else
result_temp=testword+“  “+result_temp; // cut
sentence_temp=sentence_temp.substr(0len1-len2); // next sentence
}
}
testfile.close();
}

bool TFidfWordCheck(string test_word) // whether t

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2015-04-10 20:25  Tfidf_Calculate\
     文件        4456  2015-05-28 00:17  Tfidf_Calculate\DirInput.cpp
     文件         940  2015-05-28 00:18  Tfidf_Calculate\main.cpp
     目录           0  2015-04-11 11:46  Tfidf_Calculate\mingw5\
     文件        3268  2003-07-21 19:40  Tfidf_Calculate\mingw5\(1).txt
     文件        5626  2015-04-15 22:41  Tfidf_Calculate\mingw5\(1)Out.txt
     文件         998  2003-07-21 19:40  Tfidf_Calculate\mingw5\(10).txt
     文件        1698  2015-04-15 22:41  Tfidf_Calculate\mingw5\(10)Out.txt
     文件        1341  2003-07-21 19:40  Tfidf_Calculate\mingw5\(100).txt
     文件        2283  2015-04-15 22:41  Tfidf_Calculate\mingw5\(100)Out.txt
     文件         699  2003-07-21 19:40  Tfidf_Calculate\mingw5\(101).txt
     文件        1241  2015-04-15 22:41  Tfidf_Calculate\mingw5\(101)Out.txt
     文件         963  2003-07-21 19:40  Tfidf_Calculate\mingw5\(102).txt
     文件        1651  2015-04-15 22:41  Tfidf_Calculate\mingw5\(102)Out.txt
     文件        3045  2003-07-21 19:40  Tfidf_Calculate\mingw5\(103).txt
     文件        5183  2015-04-15 22:41  Tfidf_Calculate\mingw5\(103)Out.txt
     文件         785  2003-07-21 19:40  Tfidf_Calculate\mingw5\(104).txt
     文件        1339  2015-04-15 22:41  Tfidf_Calculate\mingw5\(104)Out.txt
     文件         814  2003-07-21 19:40  Tfidf_Calculate\mingw5\(105).txt
     文件        1442  2015-04-15 22:41  Tfidf_Calculate\mingw5\(105)Out.txt
     文件        1190  2003-07-21 19:40  Tfidf_Calculate\mingw5\(106).txt
     文件        2168  2015-04-15 22:41  Tfidf_Calculate\mingw5\(106)Out.txt
     文件        1265  2003-07-21 19:40  Tfidf_Calculate\mingw5\(107).txt
     文件        2209  2015-04-15 22:41  Tfidf_Calculate\mingw5\(107)Out.txt
     文件        1157  2003-07-21 19:40  Tfidf_Calculate\mingw5\(108).txt
     文件        2001  2015-04-15 22:41  Tfidf_Calculate\mingw5\(108)Out.txt
     文件        1195  2003-07-21 19:40  Tfidf_Calculate\mingw5\(109).txt
     文件        2011  2015-04-15 22:41  Tfidf_Calculate\mingw5\(109)Out.txt
     文件         788  2003-07-21 19:40  Tfidf_Calculate\mingw5\(11).txt
     文件        1400  2015-04-15 22:41  Tfidf_Calculate\mingw5\(11)Out.txt
     文件        1000  2003-07-21 19:40  Tfidf_Calculate\mingw5\(110).txt
............此处省略308个文件信息

评论

共有 条评论