• 大小:
    文件类型: .zip
    金币: 2
    下载: 1 次
    发布日期: 2023-09-25
  • 语言: 其他
  • 标签: python  

资源简介

微博情感分析,文本分类,毕业设计项目

资源截图

代码片段和文件信息

import random
import re
import traceback

import jieba
import numpy as np
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB

# Load the project's custom dictionary so jieba segments domain-specific terms.
jieba.load_userdict("train/word.txt")
# Stop-word list (one word per line); consumed by the functions below.
# Use a context manager so the file handle is closed deterministically.
with open("ad/stop.txt", "r", encoding="utf-8") as _stop_file:
    stop = [line.strip() for line in _stop_file]


def build_key_word(path):  # derive feature words from term frequency
    """Build the feature vocabulary for *path* by word frequency.

    Each line is segmented with jieba.  Tokens consisting only of ASCII
    word characters (letters/digits/underscore) are discarded — they carry
    no sentiment signal here — as are single-character tokens.  The top
    20% most frequent remaining words, minus the stop-word list, form the
    vocabulary.

    :param path: UTF-8 text file, one document per line.
    :return: list of feature words (order unspecified).
    """
    # The original used re.compile(r'\w', re.L); re.LOCALE is invalid with
    # str patterns in Python 3, and its Python-2 effect was ASCII-only \w,
    # so spell that character class out explicitly.  Compile once, not per
    # token.
    ascii_word = re.compile(r"[A-Za-z0-9_]")
    freq = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                # Stripping ASCII word chars leaves "" for pure-ASCII
                # tokens (numbers, Latin words); skip those.
                residue = ascii_word.sub("", word)
                if not residue or residue == " ":
                    continue
                if len(word) > 1:  # single characters are mostly noise
                    freq[word] = freq.get(word, 0) + 1
    ranked = sorted(freq, key=freq.get, reverse=True)
    size = int(len(ranked) * 0.2)  # keep the top 20% most frequent words
    return list(set(ranked[:size]) - set(stop))


def loadDataSet(path):  # return per-weibo token lists and their labels
    """Load a labelled corpus: one weibo per line, 2-char numeric label prefix.

    Each line is split into a label (``int(line[:2])``) and a sentence,
    which is segmented with jieba; pure-ASCII tokens, stop words and a few
    invisible/whitespace characters are removed, and duplicates dropped.
    Malformed lines are skipped best-effort.

    :param path: UTF-8 text file, one "<2-digit label><text>" record per line.
    :return: tuple ``(line_cut, label)`` — parallel lists of token lists
             and int labels, always the same length.
    """
    line_cut = []
    label = []
    ascii_word = re.compile(r"[A-Za-z0-9_]")  # hoisted; see build_key_word
    # Characters to purge alongside stop words: zero-width space, plain and
    # ideographic spaces, and a stray variation selector.
    junk = set(stop) | set("\u200b") | set(" ") | set("\u3000") | set("️")
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the weibo text
                mood = int(temp[:2])          # the annotated label
                sentence = str(sentence).replace("\u200b", "")
                word_list = []
                for word in jieba.cut(sentence.strip()):
                    residue = ascii_word.sub("", word)
                    if not residue or residue == " ":  # pure-ASCII token
                        continue
                    word_list.append(word)
                # Append to BOTH lists only after the record fully parsed;
                # the original appended the label first, so a later failure
                # left `label` and `line_cut` out of sync.
                line_cut.append(list(set(word_list) - junk))
                label.append(mood)
            except Exception:
                # Best-effort loader: skip malformed lines silently, as the
                # original did.
                continue
    return line_cut, label


def setOfWordsToVecTor(vocabularyList, moodWords):  # vectorize one weibo
    """Return the bag-of-words count vector of *moodWords* over the vocabulary.

    :param vocabularyList: ordered feature words; each defines one position.
    :param moodWords: iterable of tokens from a single weibo.
    :return: 1-D numpy int array, ``len == len(vocabularyList)``; position i
             holds how often ``vocabularyList[i]`` occurs in *moodWords*.
    """
    # Build word -> first position once: O(V + W) overall, instead of the
    # original list.index() inside the loop, which was O(V * W).
    # setdefault keeps the FIRST occurrence, matching list.index semantics
    # if the vocabulary ever contained duplicates.
    position = {}
    for i, word in enumerate(vocabularyList):
        position.setdefault(word, i)
    marked = [0] * len(vocabularyList)
    for word in moodWords:
        idx = position.get(word)
        if idx is not None:
            marked[idx] += 1
    return np.array(marked)


def setOfWordsListToVecTor(vocabularyList, train_mood_array):  # vectorize all weibos
    """Vectorize every tokenized weibo against the shared vocabulary.

    :param vocabularyList: ordered feature words (see setOfWordsToVecTor).
    :param train_mood_array: list of token lists, one per weibo.
    :return: list of 1-D numpy count vectors, parallel to *train_mood_array*.
    """
    # Comprehension replaces the original index-based loop-and-append.
    return [setOfWordsToVecTor(vocabularyList, mood_words)
            for mood_words in train_mood_array]


def trainingNaiveBayes(train_mood_array label):  # 计算先验概率
    numTrainDoc = len(train_mood_array)
    numWords = len(train_mood_array[0])
    prior_Pos prior_Neg prior_Neutral = 0.0 0.0 0.0
    for i in label:
        if i == 1:
            prior_Pos = prior_Pos + 1
        elif i == 2:
            prior_Neg = prior_Neg + 1
        else:
            prior_Neutral = prior_Neutral + 1
    prior_Pos = prior_Pos / float(numTrainDoc)
    prior_Neg = prior_Neg / float(numTrainDoc)
    prior_Neutral = prior_Neutral / float(numTrainDoc)
    wordsInPosNum = np.ones

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----
     目录           0  2017-12-17 09:54  weiboanalysis-master\
     文件        6977  2017-12-17 09:54  weiboanalysis-master\Bayes.py
     文件       11357  2017-12-17 09:54  weiboanalysis-master\LICENSE
     文件        1765  2017-12-17 09:54  weiboanalysis-master\README.md
     文件        3357  2017-12-17 09:54  weiboanalysis-master\SVM.py
     目录           0  2017-12-17 09:54  weiboanalysis-master\ad\
     文件       45522  2017-12-17 09:54  weiboanalysis-master\ad\advertise.txt
     文件       66038  2017-12-17 09:54  weiboanalysis-master\ad\normal.txt
     文件       13407  2017-12-17 09:54  weiboanalysis-master\ad\stop.txt
     文件          79  2017-12-17 09:54  weiboanalysis-master\ad\train.txt
     目录           0  2017-12-17 09:54  weiboanalysis-master\doc\
     文件     1295601  2017-12-17 09:54  weiboanalysis-master\doc\基于AdaBoost算法的情感分析研究.docx
     文件        2385  2017-12-17 09:54  weiboanalysis-master\draw_pic.py
     文件         347  2017-12-17 09:54  weiboanalysis-master\from_database.py
     文件         749  2017-12-17 09:54  weiboanalysis-master\jiebatest.py
     目录           0  2017-12-17 09:54  weiboanalysis-master\model\
     文件         539  2017-12-17 09:54  weiboanalysis-master\model\gnb.model
     文件          92  2017-12-17 09:54  weiboanalysis-master\model\gnb.model_01.npy
     文件         104  2017-12-17 09:54  weiboanalysis-master\model\gnb.model_02.npy
     文件        7352  2017-12-17 09:54  weiboanalysis-master\model\gnb.model_03.npy
     文件         104  2017-12-17 09:54  weiboanalysis-master\model\gnb.model_04.npy
     文件        7352  2017-12-17 09:54  weiboanalysis-master\model\gnb.model_05.npy
     目录           0  2017-12-17 09:54  weiboanalysis-master\multi_AdaBoost\
     文件        7951  2017-12-17 09:54  weiboanalysis-master\multi_AdaBoost\Bayes.py
     目录           0  2017-12-17 09:54  weiboanalysis-master\multi_AdaBoost\model\
     文件       15554  2017-12-17 09:54  weiboanalysis-master\multi_AdaBoost\model\gnb.model
     文件        2635  2017-12-17 09:54  weiboanalysis-master\multi_AdaBoost\multi_boost.py
     文件        4171  2017-12-17 09:54  weiboanalysis-master\multi_AdaBoost\multi_test.py
     目录           0  2017-12-17 09:54  weiboanalysis-master\ntusd\
     文件       80818  2017-12-17 09:54  weiboanalysis-master\ntusd\ntusd-negative.txt
     文件       26508  2017-12-17 09:54  weiboanalysis-master\ntusd\ntusd-positive.txt
............此处省略54个文件信息

评论

共有 条评论