• 大小: 22.36MB
    文件类型: .zip
    金币: 1
    下载: 0 次
    发布日期: 2023-07-03
  • 语言: 其他
  • 标签: 垃圾邮件  spam  ham  

资源简介

该数据集由一系列邮件组成，适用于测试垃圾邮件过滤系统，请勿用作商业目的。

资源截图

代码片段和文件信息

#!/usr/bin/python
# FileName: Subsampling.py 
# Version 1.0 by Tao Ban 2010.5.26
# This script extracts the content (i.e. the subject and the first payload part) of each .eml file
# and stores it in a new file with the same name in the dst dir.

import email.parser
import os
import shutil
import stat
import sys

def ExtractSubPayload(filename):
    '''Extract the subject and payload from the .eml file.

    Parameters:
        filename: path of the .eml message to read.

    Returns:
        The value of the Subject header converted with str() (so the text
        "None" when the header is absent), immediately followed by the
        payload with no separator.  For a multipart message only the first
        part is used, converted with str(); a multipart message with no
        parts contributes an empty payload.

    Exits the process with status 1 when ``filename`` does not exist.
    '''
    if not os.path.exists(filename):  # input path does not exist
        print("ERROR: input file does not exist: %s" % filename)
        # sys.exit, not os.exit: the os module has no exit() function.
        sys.exit(1)

    # The context manager closes the handle even if parsing raises.
    with open(filename) as fp:
        msg = email.message_from_file(fp)

    payload = msg.get_payload()
    if isinstance(payload, list):
        # only use the first part of payload; guard against an empty list
        payload = payload[0] if payload else ''

    sub = str(msg.get('subject'))
    if not isinstance(payload, str):
        payload = str(payload)

    return sub + payload

def ExtractBodyFromDir(srcdir, dstdir):
    '''Extract the body information from all files in the srcdir and
    save the result to the dstdir under the same name.

    Parameters:
        srcdir: directory to read messages from.
        dstdir: directory to write the extracted text to; it is created
            when it does not exist.

    Every entry of ``srcdir`` is processed, whatever its extension.
    Sub-directories are handled recursively and mirrored under ``dstdir``;
    every other entry is passed to ExtractSubPayload and the returned text
    is written to a file with the same name in ``dstdir``.
    '''
    if not os.path.exists(dstdir):  # dest path does not exist
        os.makedirs(dstdir)

    for name in os.listdir(srcdir):
        srcpath = os.path.join(srcdir, name)
        dstpath = os.path.join(dstdir, name)
        if os.path.isdir(srcpath):  # for subfolders recurse
            ExtractBodyFromDir(srcpath, dstpath)
        else:  # extract the content and store it
            body = ExtractSubPayload(srcpath)
            # The context manager closes the file even if the write fails.
            with open(dstpath, 'w') as dstfile:
                dstfile.write(body)


###################################################################
# main script starts here
# Asks on standard input for a source and a destination directory, then
# extracts the content of every file under the source directory into the
# destination directory.

# raw_input() only exists on Python 2; on Python 3 input() does the same job.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

# srcdir is the directory where the .eml are stored
print('Input source directory: ')  # ask for the source dir
srcdir = _read_line()
if not os.path.exists(srcdir):
    print('The source directory %s does not exist exit...' % (srcdir))
    # Non-zero status so callers can tell the run failed.
    sys.exit(1)
# dstdir is the directory where the extracted contents are stored
print('Input destination directory: ')  # ask for the destination dir
dstdir = _read_line()
if not os.path.exists(dstdir):
    print('The destination directory is newly created.')
    os.makedirs(dstdir)

###################################################################
ExtractBodyFromDir(srcdir, dstdir)


 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2010-05-28 16:46  CSDMC2010_SPAM\
     目录           0  2010-05-28 16:46  CSDMC2010_SPAM\CSDMC2010_SPAM\
     文件        2177  2010-05-27 09:28  CSDMC2010_SPAM\CSDMC2010_SPAM\ExtractContent.py
     文件        3411  2010-05-27 09:29  CSDMC2010_SPAM\CSDMC2010_SPAM\readme.txt
     文件       77886  2010-05-27 06:27  CSDMC2010_SPAM\CSDMC2010_SPAM\SPAMTrain.label
     目录           0  2010-05-28 16:47  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\
     文件        6215  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00000.eml
     文件        6484  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00001.eml
     文件        7705  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00002.eml
     文件        6260  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00003.eml
     文件       33094  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00004.eml
     文件       49320  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00005.eml
     文件        3163  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00006.eml
     文件        2519  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00007.eml
     文件       30295  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00008.eml
     文件        2514  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00009.eml
     文件       13698  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00010.eml
     文件        5639  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00011.eml
     文件        1098  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00012.eml
     文件        5555  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00013.eml
     文件        6049  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00014.eml
     文件        4667  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00015.eml
     文件        3945  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00016.eml
     文件        7610  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00017.eml
     文件        3487  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00018.eml
     文件        5110  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00019.eml
     文件        5037  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00020.eml
     文件        6634  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00021.eml
     文件        6406  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00022.eml
     文件        2297  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00023.eml
     文件        3867  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00024.eml
............此处省略8595个文件信息

评论

共有 条评论