我们用一部英文小说,即H. G. Wells的Time Machine,作为示例,展示文本预处理的具体过程。
1 2 3 4 5 6 7 8 9 10 11
import collections
import re
def read_time_machine(path='./input/timemachine.txt'):
    """Read a text corpus and return a list of cleaned lines.

    Each line is stripped, lower-cased, and every run of non-letter
    characters is collapsed to a single space, so only words made of
    ``a``-``z`` and spaces remain.

    Parameters
    ----------
    path : str, optional
        Location of the text file. Defaults to the Time Machine corpus
        used by the tutorial, so existing callers are unaffected.

    Returns
    -------
    list of str
        One cleaned string per line of the file.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # [^a-z]+ matches any run of characters that are not lowercase
        # letters (punctuation, digits, whitespace) after lower-casing.
        return [re.sub(r'[^a-z]+', ' ', line.strip().lower()) for line in f]
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens.

    Parameters
    ----------
    sentences : list of str
        Cleaned sentences (e.g. output of ``read_time_machine``).
    token : str, optional
        Tokenization granularity, either ``'word'`` or ``'char'``.

    Returns
    -------
    list of list of str
        One token list per input sentence.

    Raises
    ------
    ValueError
        If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    # Fail loudly: the original printed "ERROR: unkown token type" (typo
    # included) and silently returned None, which crashes callers later.
    raise ValueError('unknown token type: ' + token)
class Vocab(object):
    """Vocabulary mapping between string tokens and integer indices.

    Builds the mapping from a tokenized corpus via ``count_corpus``
    (defined elsewhere in this file). Tokens with frequency below
    ``min_freq`` are excluded.

    Parameters
    ----------
    tokens : list of list of str
        Tokenized sentences used to build the vocabulary.
    min_freq : int, optional
        Minimum frequency for a token to be kept (default 0, keep all).
    use_special_tokens : bool, optional
        If True, reserve indices 0-3 for <pad>, <bos>, <eos>, <unk>;
        otherwise only <unk> is reserved at index 0.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)  # token -> frequency
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            # NOTE: these literals were stripped to '' by HTML extraction
            # in the original; restored to the intended special tokens.
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and
                              token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        """Return the number of distinct tokens (including specials)."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token (or list/tuple of tokens) to index/indices.

        Unknown tokens map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index (or list/tuple of indices) back to token(s)."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
def count_corpus(sentences):
    """Count token frequencies across a tokenized corpus.

    Parameters
    ----------
    sentences : list of list of str
        Tokenized sentences.

    Returns
    -------
    collections.Counter
        Dict-like mapping of each token to its number of occurrences.
    """
    # Feed a lazy generator straight into Counter instead of first
    # materializing a flat token list.
    return collections.Counter(
        token for sentence in sentences for token in sentence)