VSM模型构建样例项目文件

大小: 320KB

文件类型: .zip

金币: 2

下载: 1 次

发布日期: 2021-06-05
语言: Python
标签: python VSM

高速下载

资源简介

基于 Python 3 编写的 VSM（向量空间模型）构建样例，包含分词处理后的输入文件以及停用词文件。

资源截图

小图大图

代码片段和文件信息

# -*- coding: utf-8 -*-
import json
import math
from collections import Counter

import numpy as np


def read_file(path):
    """Read a UTF-8 text file into a list of token lists, one per line.

    Each line has its tabs replaced by single spaces, leading/trailing
    carriage returns and newlines removed, and is then split on single
    spaces. Consecutive spaces therefore yield empty-string tokens.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one list of string tokens per input line, or ``None``
        when the file cannot be opened or decoded (in which case
        ``"read error"`` is printed).
    """
    try:
        with open(path, "r", encoding='utf-8') as fin:
            return [
                line.replace('\t', ' ').strip('\r\n').split(' ')
                for line in fin
            ]
    except (OSError, UnicodeError):
        # Only I/O and decoding failures are reported as a read error;
        # anything else is a programming error and should propagate.
        print("read error\n")
        return None


def save_file(path, data):
    """Write ``str(data)`` to *path* as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.

    Args:
        path: Destination file location.
        data: Any object; its ``str()`` representation is what gets written.
    """
    with open(path, "w", encoding='utf-8') as fout:
        fout.write(str(data))


def wipe_stopwords(filepath, data):
    """Remove stop words and blank tokens from tokenised lines.

    The first token of every line is always dropped (presumably a document
    id or label -- confirm against the input format). Of the remaining
    tokens, those listed in the stop-word file, empty strings, bare
    newlines and the ideographic space U+3000 are discarded.

    Args:
        filepath: UTF-8 file with one stop word per line; surrounding
            whitespace on each line is stripped.
        data: Iterable of token iterables, e.g. the result of ``read_file``.

    Returns:
        A list with one list of surviving tokens per input line, in the
        original order.
    """
    with open(filepath, 'r', encoding='utf-8') as fin:
        stop_words = {line.strip() for line in fin}

    # A set gives O(1) membership tests; the blank-like tokens are folded in
    # so every word needs a single lookup.
    ignored = stop_words | {'', '\n', '\u3000'}

    return [
        [
            word
            for position, word in enumerate(line)
            if position > 0 and word not in ignored
        ]
        for line in data
    ]


def calc_tfidf(data):
    """Compute a TF-IDF weight for every term of every document.

    For a term with raw count ``c`` in a document whose most frequent term
    occurs ``max_c`` times and which holds ``k`` distinct terms::

        tf     = (0.5 + 0.5 * c / max_c) * (1 / k)
        idf    = ln((N + 1) / n)
        weight = idf * tf

    where ``N`` is the number of documents and ``n`` the number of
    documents containing the term. Because of the ``+ 1``, a term present
    in every document still gets a positive weight.

    Args:
        data: Sequence of documents, each a sequence of hashable tokens.

    Returns:
        A list with one dict per document mapping term -> weight, with
        entries ordered by descending weight. Documents without tokens
        yield an empty dict.
    """
    # Raw term counts per document; Counter keeps first-appearance order.
    term_counts = [Counter(line) for line in data]
    doc_count = len(term_counts)

    # Document frequency is computed once for the whole corpus instead of
    # rescanning every document for every term.
    doc_freq = Counter(word for counts in term_counts for word in counts)

    tf_idf_list = []
    for counts in term_counts:
        weights = {}
        if counts:
            max_tf = max(counts.values())
            distinct_terms = len(counts)
            # Walk terms from most to least frequent so that ties in the
            # final weight ordering fall in that order.
            for word, count in counts.most_common():
                tf = (0.5 + 0.5 * (count / max_tf)) * (1.0 / distinct_terms)
                weights[word] = math.log((doc_count + 1) / doc_freq[word]) * tf
        ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
        tf_idf_list.append(dict(ranked))

    return tf_idf_list


# 余弦值计算
def calc_cos（data）:
    cos_value = {}
    for doc1 in range（len（data））:  # 最终结果保存格式为：[文档序号1-文档序号2]：余弦值
        for doc2 in range（doc1 + 1  len（d

属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2018-05-31 21:21  VSM\
     文件        3659  2018-05-28 10:20  VSM\Chinese-StopWords.txt
     文件      202230  2013-05-22 13:30  VSM\input.txt
     文件        4158  2018-05-31 21:17  VSM\main.py
     文件      306205  2018-05-30 10:27  VSM\result.json
     文件      506475  2018-05-29 13:20  VSM\tf_idf_result.txt

上一篇：Python获取气象网站中的台风详细数据
下一篇：python深度学习电子版+源码

共有条评论

VSM模型构建样例项目文件

资源简介

资源截图

代码片段和文件信息

评论

相关资源