资源简介

基于深度学习的文本相似度计算模型和代码,已亲自运行验证,可以直接使用;对 NLP 领域的学习很有借鉴意义,在智能问答系统中经常会用到。

资源截图

代码片段和文件信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author: yanqiang
@File: build_input.py
@Time: 2018/11/30 17:41
@Software: PyCharm
@Description: Build the model inputs
"""
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from gensim.models import Word2Vec

# train = load_atec()


# train, dev, test = load_ccks()


def select_best_length(train, limit_ratio=0.95):
    """
    Choose a maximum sequence length from the sentence lengths in ``train``.

    The lengths of every item in ``train['q1']`` and ``train['q2']`` are
    counted, then lengths are walked in ascending order until the share of
    sentences that are at most that long reaches ``limit_ratio``.

    :param train: object indexable by ``'q1'`` and ``'q2'``, each yielding
        an iterable of sized items (e.g. strings)
    :param limit_ratio: required fraction of sentences whose length is
        less than or equal to the returned value (default 0.95)
    :return: the smallest observed length that reaches the coverage; the
        largest observed length if the ratio is never reached; 0 when
        there are no sentences
    """
    length_counts = Counter()
    for q1, q2 in zip(train['q1'], train['q2']):
        length_counts[len(q1)] += 1
        length_counts[len(q2)] += 1

    all_sent = sum(length_counts.values())
    if all_sent == 0:
        # Nothing to measure; avoid dividing by zero below.
        print('average_length:', 0.0)
        print('max_length:', 0)
        return 0

    sum_length = sum(length * count for length, count in length_counts.items())
    average_length = sum_length / all_sent

    max_length = 0
    covered = 0
    # Accumulate by ascending length (not by frequency) so that `covered`
    # is exactly the number of sentences no longer than `length`.
    for length, count in sorted(length_counts.items()):
        covered += count
        max_length = length
        # Integer counts are compared so that full coverage is exactly 1.0.
        if covered / all_sent >= limit_ratio:
            break

    print('average_length:', average_length)
    print('max_length:', max_length)
    return max_length


# select_best_length()

#返回train_xy
def build_data(train):
    """
    Build the token samples, labels and vocabulary from ``train``.

    Side effect: writes the vocabulary, one entry per line, to
    ``model/vocab.txt`` (the directory is created when missing).

    :param train: object exposing ``q1``, ``q2`` and ``label`` attributes
        that support ``.apply`` / ``.tolist`` (presumably a pandas
        DataFrame -- confirm against the caller)
    :return: ``(datas, word_dict)`` where ``datas`` is
        ``[[left_samples, right_samples], labels]`` and ``word_dict`` maps
        each vocabulary entry to its integer index
    """
    import os  # local import: only needed to create the output directory

    # Split every q1 / q2 sample into its non-empty elements.
    sample_x_left = train.q1.apply(lambda x: [char for char in x if char]).tolist()
    sample_x_right = train.q2.apply(lambda x: [char for char in x if char]).tolist()

    # Collect the vocabulary; 'UNK' is always present.
    vocabs = {'UNK'}
    for x_left, x_right in zip(sample_x_left, sample_x_right):
        vocabs.update(x_left)
        vocabs.update(x_right)

    sample_x = [sample_x_left, sample_x_right]
    sample_y = train.label.tolist()
    print(len(sample_x_left), len(sample_x_right))
    datas = [sample_x, sample_y]

    # Sort once so the indices are reproducible across runs (set iteration
    # order is not) and the file order matches word_dict.
    # NOTE(review): indices start at 0, which is also the default padding
    # value of keras pad_sequences -- confirm the model accounts for this.
    vocab_list = sorted(vocabs)
    word_dict = {wd: index for index, wd in enumerate(vocab_list)}

    vocab_path = 'model/vocab.txt'
    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab_list))
    return datas, word_dict


def convert_data(datas, word_dict, MAX_LENGTH):
    """
    Convert token samples into padded index arrays and a label column.

    :param datas: ``[[left_samples, right_samples], labels]``, the first
        value returned by ``build_data``
    :param word_dict: mapping from token to integer index
    :param MAX_LENGTH: target sequence length passed to ``pad_sequences``
    :return: ``(left_x_train, right_x_train, y_train)``; ``y_train`` has a
        trailing axis of size 1 added
    :raises KeyError: if a token is missing from ``word_dict`` and
        ``word_dict`` has no ``'UNK'`` entry
    """
    (sample_x_left, sample_x_right), sample_y = datas[0], datas[1]

    unk_index = word_dict.get('UNK')

    def to_index(char):
        # Fall back to the 'UNK' entry for unseen tokens when it exists;
        # otherwise keep the plain lookup so a KeyError is still raised.
        if unk_index is None:
            return word_dict[char]
        return word_dict.get(char, unk_index)

    left_x_train = [[to_index(char) for char in data] for data in sample_x_left]
    right_x_train = [[to_index(char) for char in data] for data in sample_x_right]
    y_train = [int(i) for i in sample_y]

    left_x_train = pad_sequences(left_x_train, maxlen=MAX_LENGTH, padding='pre')
    right_x_train = pad_sequences(right_x_train, maxlen=MAX_LENGTH, padding='pre')
    # The labels are 1-D, so axis=2 is out of range; axis=-1 appends the
    # trailing axis, giving shape (n, 1).
    y_train = np.expand_dims(y_train, -1)
    return left_x_train, right_x_train, y_train


def train_w2v(datas):
    “““
    训练词向量
    :return:
    “““
    sents = datas[0][0] + datas[0][1]
    #print(sents)
    model = Word2Vec(sentences=sents size=300 min_

 属性            大小     日期    时间   名称
----------- ---------  ---------- -----  ----

     文件       6148  2020-02-26 18:01  sentence-similarity-project\.DS_Store

     文件        141  2018-12-05 17:08  sentence-similarity-project\.gitignore

     文件        128  2020-03-28 21:48  sentence-similarity-project\.idea\libraries\R_User_Library.xml

     文件        315  2020-03-28 21:46  sentence-similarity-project\.idea\misc.xml

     文件        313  2020-03-28 21:46  sentence-similarity-project\.idea\modules.xml

     文件        611  2020-03-28 21:48  sentence-similarity-project\.idea\sentence-similarity-project.iml

     文件      16934  2020-03-29 15:07  sentence-similarity-project\.idea\workspace.xml

     文件       4702  2020-03-28 23:25  sentence-similarity-project\build_input.py

     文件       1753  2020-03-05 21:25  sentence-similarity-project\data_loader.py

     文件       1780  2020-03-05 21:40  sentence-similarity-project\evalute.py

     文件    3485318  2020-03-05 21:23  sentence-similarity-project\input\atec\atec_nlp_sim_train.csv

     文件    5625804  2018-12-05 17:08  sentence-similarity-project\input\atec\atec_nlp_sim_train_add.csv

     文件        946  2018-12-05 17:08  sentence-similarity-project\input\atec\readme.txt

     文件        609  2018-12-05 17:08  sentence-similarity-project\input\ccks\Readme

     文件     760958  2018-12-05 17:08  sentence-similarity-project\input\ccks\task3_dev.txt

     文件    7355965  2018-12-05 17:08  sentence-similarity-project\input\ccks\task3_train.txt

     文件    8555401  2018-12-05 17:08  sentence-similarity-project\input\ccks\test_with_id.txt

     文件      23854  2020-03-29 12:06  sentence-similarity-project\model\model.png

     文件      29593  2020-03-29 12:27  sentence-similarity-project\model\result_atec.png

     文件      25260  2018-12-05 17:08  sentence-similarity-project\model\result_ccks.png

     文件    8809848  2020-03-29 12:27  sentence-similarity-project\model\tokenvec_bilstm2_siamese_model.h5

     文件    7847540  2020-03-29 12:06  sentence-similarity-project\model\token_vec_300.bin

     文件      10735  2020-03-29 12:06  sentence-similarity-project\model\vocab.txt

     文件       4329  2020-03-29 12:06  sentence-similarity-project\train_siamese_network.py

     文件       5003  2020-03-28 23:25  sentence-similarity-project\__pycache__\build_input.cpython-36.pyc

     文件       1481  2020-03-28 21:48  sentence-similarity-project\__pycache__\data_loader.cpython-36.pyc

     文件     175767  2020-02-26 18:00  sentence-similarity-project\文本相似度建模.pdf

     目录          0  2020-03-28 21:48  sentence-similarity-project\.idea\inspectionProfiles

     目录          0  2020-03-28 21:48  sentence-similarity-project\.idea\libraries

     目录          0  2020-03-05 21:23  sentence-similarity-project\input\atec

............此处省略9个文件信息

评论

共有 条评论