(Web-page boilerplate removed; the content below is code from the d2l word2vec/PTB tutorial.)
#@save
def read_ptb():
    """Load the PTB dataset into a list of token lists, one per text line.

    Downloads and extracts the dataset on first use (via d2l's download
    hub), reads the training split, and whitespace-tokenizes each line.

    Returns:
        list[list[str]]: one list of tokens per line of `ptb.train.txt`.
    """
    data_dir = d2l.download_extract('ptb')
    # Read the training set. Pin the encoding so behavior does not depend
    # on the platform's default locale encoding.
    with open(os.path.join(data_dir, 'ptb.train.txt'), encoding='utf-8') as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]
sentences = read_ptb()
# NOTE: in a plain script (unlike a notebook) a bare f-string expression is
# evaluated and discarded, so print the dataset statistics explicitly.
print(f'# sentences数: {len(sentences)}')
vocab = d2l.Vocab(sentences, min_freq=10)
print(f'vocab size: {len(vocab)}')
#@save
def subsample(sentences, vocab):
    """Subsample high-frequency words (word2vec-style).

    Tokens mapping to the unknown token are dropped, then each remaining
    token is kept with probability sqrt(1e-4 / relative_frequency).

    Returns a pair: (subsampled sentences, token frequency counter).
    """
    # First pass: drop every token the vocabulary maps to <unk>.
    filtered = []
    for line in sentences:
        filtered.append([tok for tok in line if vocab[tok] != vocab.unk])

    token_counts = d2l.count_corpus(filtered)
    total = sum(token_counts.values())

    def retain(tok):
        # Keep `tok` with probability sqrt(t / f(w)), t = 1e-4, where
        # f(w) is the token's relative frequency in the corpus.
        threshold = math.sqrt(1e-4 / token_counts[tok] * total)
        return random.uniform(0, 1) < threshold

    subsampled = [[tok for tok in line if retain(tok)] for line in filtered]
    return (subsampled, token_counts)
Note: run `pip install d2l==0.17.6` to use an older version of d2l that still includes these saved functions; in the latest version the code was refactored and they were removed.
import math
import os
import random
import torch
from d2l import torch as d2l
import os
import matplotlib.pyplot as plt
# NOTE(review): workaround for the "OMP: Error #15" duplicate-OpenMP-runtime
# abort that can occur when PyTorch and matplotlib each bundle libiomp
# (common on Windows/conda) — presumably; confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
#@save
# Register the PTB dataset with d2l's download hub: (URL, SHA-1 checksum).
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip',
                       '319d85e578af0cdc590547f26231e4e31cdf1e42')
#@save
def read_ptb():
    """Load the PTB dataset into a list of token lists, one per text line.

    Downloads and extracts the dataset on first use (via d2l's download
    hub), reads the training split, and whitespace-tokenizes each line.

    Returns:
        list[list[str]]: one list of tokens per line of `ptb.train.txt`.
    """
    data_dir = d2l.download_extract('ptb')
    # Read the training set with an explicit encoding so the result does
    # not depend on the platform's default locale encoding.
    with open(os.path.join(data_dir, 'ptb.train.txt'), encoding='utf-8') as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]
sentences = read_ptb()
# NOTE: in a plain script (unlike a notebook) a bare f-string expression is
# evaluated and discarded, so print the dataset statistics explicitly.
print(f'# sentences数: {len(sentences)}')
vocab = d2l.Vocab(sentences, min_freq=10)
print(f'vocab size: {len(vocab)}')
#@save
def subsample(sentences, vocab):
    """Subsample high-frequency words (word2vec-style).

    Tokens mapping to the unknown token are dropped, then each remaining
    token is kept with probability sqrt(1e-4 / relative_frequency).

    Returns a pair: (subsampled sentences, token frequency counter).

    NOTE: this copy of the function was truncated in the original text
    (it ended after `num_tokens = ...`, implicitly returning None, which
    would crash the tuple-unpacking at the call site below). The missing
    `keep` helper and return statement are restored here.
    """
    # Exclude tokens that map to the unknown token.
    sentences = [[token for token in line if vocab[token] != vocab.unk]
                 for line in sentences]
    counter = d2l.count_corpus(sentences)
    num_tokens = sum(counter.values())

    # Return True if `token` is kept during subsampling.
    def keep(token):
        return (random.uniform(0, 1) <
                math.sqrt(1e-4 / counter[token] * num_tokens))

    return ([[token for token in line if keep(token)] for line in sentences],
            counter)
# Apply subsampling, then compare the per-sentence token-count histograms
# of the original and subsampled corpora side by side.
subsampled, counter = subsample(sentences, vocab)
d2l.show_list_len_pair_hist(
    ['origin', 'subsampled'], '# tokens per sentence',
    'count', sentences, subsampled)
plt.show()
(End of scraped page; trailing GitHub issue boilerplate removed.)