-
大小: 6KB | 文件类型: .py | 金币: 1 | 下载: 0 次 | 发布日期: 2021-06-10
- 语言: Python
- 标签: word2vec tensorflow
资源简介
word2vec的tensorflow实现,来自黄文坚的“tensorflow实战”
代码片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import math
import urllib.request
import zipfile
import random
import collections
import numpy as np
import tensorflow as tf
# Base URL that the dataset file name is appended to when downloading.
url = 'http://mattmahoney.net/dc/'

# Step 1: download dataset
def may_download(filename, expected_bytes):
    """Download ``filename`` if it is missing, then verify its size.

    Args:
        filename: Local path of the file; it is also appended to the
            module-level ``url`` to form the download address.
        expected_bytes: Size in bytes the file must have on disk.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size to make the mismatch easy to diagnose.
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename)
    return filename
# Fetch the archive (if needed) and check it has the expected byte size.
filename = may_download('text8.zip', 31344016)

# Step 2: data transformation
def read_data(filename):
    """Read the first member of a zip archive as a list of tokens.

    Args:
        filename: Path to the zip archive.

    Returns:
        The member's content converted to ``str`` and split on whitespace.
    """
    with zipfile.ZipFile(filename) as f:
        # Only the first archive member is read.
        data = tf.compat.as_str_any(f.read(f.namelist()[0])).split()
    return data
# Test: load the corpus and print how many tokens it contains.
words = read_data(filename)
print('Datas size', len(words))

# Step 3: make dataset
# Number of vocabulary entries, counting the 'UNK' placeholder.
vocabulary_size = 50000
def build_dataset(words, vocab_size=None):
    """Encode a token list as integer ids over a frequency-ranked vocabulary.

    Args:
        words: Sequence of string tokens.
        vocab_size: Total number of vocabulary entries including the 'UNK'
            placeholder. Defaults to the module-level ``vocabulary_size``.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the list of ids for ``words`` (0 for tokens outside the
        vocabulary); ``count`` is ``[['UNK', n_unknown], (word, freq), ...]``
        in descending frequency; ``dictionary`` maps word -> id;
        ``reverse_dictionary`` maps id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # Slot 0 is reserved for unknown tokens; its count is filled in below.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # Ids follow frequency rank: the more common the word, the smaller the id.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # the raw token list is not used after encoding

# Test: show the head of the frequency table and of the encoded corpus.
print('Most common words (+UNK) ', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Step 4: generate training samples
# Read cursor into `data`; generate_batch advances it on every call.
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """Build one batch of (center id, context id) pairs.

    Reads from the module-level ``data`` sequence starting at the
    module-level ``data_index`` and advances ``data_index`` (wrapping around
    at the end of ``data``).

    Args:
        batch_size: Number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: Number of pairs generated per center position; must not
            exceed ``2 * skip_window``.
        skip_window: Number of positions on each side of the center that a
            context id may be drawn from.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding center ids and context ids respectively.

    Raises:
        AssertionError: If the constraints on ``batch_size`` / ``num_skips``
            above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= (2 * skip_window)
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Window layout: [skip_window ids] center [skip_window ids].
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        # Start on the center so the first while-loop always draws a sample.
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Draw a window position that is neither the center nor reused.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one position; maxlen drops the oldest id.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
# Test: generate a small batch and print each center -> context pair.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

# Step 5: training
batch_size = 128
embedding_size = 128
# NOTE(review): 128 positions on each side is an unusually wide window and
# repeats the embedding_size value; it may be a copy error -- confirm.
skip_window = 128
num_skips = 2
valid_size = 16
valid_window = 100
valid_examples =
- 上一篇:python37_d.lib文件
- 下一篇:mnist_normal
相关资源
- 基于selective_search对手写数字串进行分
- pb模型文件进行前向预测亲测可用
- tensorflow样例 BP神经网络
- TensorFlow usb摄像头视频目标检测代码
- tensorflow_gpu-2.3.1-cp37-cp37m-win_amd64.whl
- python3使用tensorflow构建CNN卷积神经网络
- tensorflow糖尿病数据二分类python代码
- 神经网络-二分类问题(IMDB) Keras
- win7 32位系统下tensorflow的安装,以及在
- Tensorflow练习1对电影评论进行分类
- tensorflow手写数字识别python源码案例
- Tensorflow之CNN实现CIFAR-10图像的分类p
- cython_bbox.so
- TensorFlow实战中实现word2vec代码含中文
- opencv_tensorflow
- tensorflow2.0实现mnist手写数字识别代码
- Python-TensorFlow语义分割组件
- tensorflow_random_forest_demo.py
- Python-利用Python实现中文文本关键词抽
- Tensorflow-BiLSTM分类
- TensorFlow实现人脸识别(3)--------对人
- Python-手势识别使用在TensorFlow中卷积神
- Python+Tensorflow+CNN实现车牌识别的
- 基于TensorFlow实现的闲聊机器人
- CBAM_MNIST.py
- TensorFlow 实现 Yolo
- 基于tensorflow的遥感影像分类
- 安装步骤。提取码也在里面
- 神经网络模型python模板
- autoencoder自编码器tensorflow代码
评论
共有 条评论