本文整理汇总了Python中vocabulary.Vocabulary类的典型用法代码示例。如果您正苦于以下问题:Python Vocabulary类的具体用法?Python Vocabulary怎么用?Python Vocabulary使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Vocabulary类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: Corpus
class Corpus(object):
    """An ordered collection of Documents that share a single Vocabulary."""

    def __init__(self):
        self.documents = []
        self.vocab = Vocabulary()
        # Once frozen, the corpus accepts no further documents.
        self.frozen = False

    def add(self, name, tokens):
        """Map *tokens* to vocabulary ids and append a Document (no-op when frozen)."""
        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):
        """Freeze every document, stop vocabulary growth, and lock the corpus."""
        for doc in self.documents:
            doc.freeze()
        self.vocab.stop_growth()
        self.frozen = True

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        """Unpickle a Corpus from *filename*.

        Pickle data is binary, so the file must be opened 'rb' — the original
        used the Python-2-only file() builtin in text mode 'r', which corrupts
        pickles on Windows and leaked the handle.
        """
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def save(self, filename):
        """Pickle this corpus to *filename*, closing the file deterministically."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
开发者ID:hannawallach,项目名称:cmpsci691bm,代码行数:34,代码来源:corpus.py
示例2: cleanUpText
def cleanUpText(self, text):
    """Lowercase *text*, split on single spaces, and drop Portuguese stop words.

    Returns the surviving words as a list, preserving their original order.
    """
    # TODO: Implement other languages tokenizers
    # Hoist the stop-word lookup out of the loop: the original re-queried
    # vocab.getPTStopWords() once per word.
    stop_words = Vocabulary().getPTStopWords()
    return [word for word in text.lower().split(' ') if word not in stop_words]
开发者ID:marcusmachado,项目名称:nbac,代码行数:12,代码来源:tokenizer.py
示例3: __extract_vocabularies_from_data
def __extract_vocabularies_from_data(self, classes):
    """Build, persist, and merge per-class training vocabularies.

    Returns the union of all class vocabularies as a sorted list.
    """
    merged = set()
    for label in classes:
        training_strings = self.__access_strings(label, '/train')
        class_vocabulary = Vocabulary(training_strings).get_vocabulary()
        self.__write_vocabulary(label, class_vocabulary)
        merged.update(class_vocabulary)  # set union, same as |= in-place OR
    return sorted(merged)
开发者ID:paupowpow,项目名称:classify-it,代码行数:13,代码来源:main.py
示例4: Corpus
class Corpus(object):
    """An ordered, sliceable collection of Documents sharing one Vocabulary."""

    def __init__(self, documents=None, vocab=None, frozen=None):
        # 'is None' sentinel checks: the original truthiness tests silently
        # replaced an explicitly passed empty documents list (or frozen=False)
        # with fresh defaults, so callers could never share an initially
        # empty list with the corpus.
        self.documents = documents if documents is not None else []
        self.vocab = vocab if vocab is not None else Vocabulary()
        self.frozen = frozen if frozen is not None else False

    def add(self, name, tokens):
        """Map *tokens* to vocabulary ids and append a Document (no-op when frozen)."""
        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):
        """Freeze every document, stop vocabulary growth, and lock the corpus."""
        for doc in self.documents:
            doc.freeze()
        self.vocab.stop_growth()
        self.frozen = True

    def __getitem__(self, i):
        return self.documents[i]

    def __getslice__(self, i, j):
        # Python 2 slicing hook; the slice shares vocab/frozen state
        # with the parent corpus.
        return Corpus(self.documents[i:j], self.vocab, self.frozen)

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        """Unpickle a Corpus; pickles are binary, so open 'rb' (the original
        used the Python-2-only file() builtin in text mode 'r')."""
        with open(filename, "rb") as f:
            return pickle.load(f)

    def save(self, filename):
        """Pickle this corpus to *filename*, closing the file deterministically."""
        with open(filename, "wb") as f:
            pickle.dump(self, f)
开发者ID:hannawallach,项目名称:cmpsci691bm,代码行数:50,代码来源:corpus.py
示例5: VocabularyTest
class VocabularyTest(unittest.TestCase):
    """Exercises Vocabulary loading, word membership, POS lookup, and DAG generation."""

    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('testdata/vocabulary.dat', 'testdata/custom_words')
        pprint.pprint(self.vocabulary.trie)
        pprint.pprint(self.vocabulary.words)

    def test_vocabulary(self):
        known_words = self.vocabulary.words.keys()
        for word in (u'英雄三国', u'魔鬼代言人', u'黄河水利委员会'):
            self.assertIn(word, known_words)
        for word in (u'十大伪歌手', u'走路太牛'):
            self.assertNotIn(word, known_words)
        expected_pos = [
            ('n', u'英雄三国'),
            ('n', u'魔鬼代言人'),
            ('nt', u'黄河水利委员会'),
            ('UNK', u'十大伪歌手'),
            ('UNK', u'走路太牛'),
        ]
        for pos, word in expected_pos:
            self.assertEqual(pos, self.vocabulary.get_pos(word))

    def test_gen_DAG(self):
        pprint.pprint(self.vocabulary.gen_DAG(
            u'《英雄三国》是由网易历时四年自主研发运营的一款英雄对战竞技网游。'))
开发者ID:fandywang,项目名称:python-wordsegmenter,代码行数:25,代码来源:vocabulary_test.py
示例6: setUp
def setUp(self):
    # Dictionary-based vocabulary used by the segmenter under test.
    self.vocabulary = Vocabulary()
    self.vocabulary.load('../data/vocabulary.dat')
    # HMM segmenter — presumably handles out-of-vocabulary spans; confirm
    # against MaxProbSegmenter's implementation.
    self.hmm_segmenter = HMMSegmenter()
    self.hmm_segmenter.load('../data/hmm_segment_model')
    # Segmenter under test combines the dictionary with the HMM model.
    self.max_prob_segmenter = MaxProbSegmenter(
        self.vocabulary, self.hmm_segmenter)
开发者ID:fandywang,项目名称:python-wordsegmenter,代码行数:7,代码来源:max_prob_segmenter_test.py
示例7: main
def main():
    """Demo: a ConstantAssignment observes an in-place Vocabulary mutation."""
    from vocabulary import Vocabulary
    from attribute import Attribute
    from attribute_structure import AttributeStructure
    from attribute_system import AttributeSystem

    vocabulary = Vocabulary(['C'], [], ['V'])
    a = Attribute("a", [])
    b = Attribute("b", [])
    astr = AttributeStructure(a, b)
    objs = ['a', 'b', 'c']
    attribute_system = AttributeSystem(astr, objs)
    C = ConstantAssignment(vocabulary, attribute_system, {'C': 'a'})
    # print() calls work on both Python 2 and 3; the original bare
    # 'print C._vocabulary' statements were Python-2-only syntax.
    print(C._vocabulary)
    # Mutating the vocabulary should be visible through the assignment,
    # since C holds a reference rather than a copy.
    vocabulary.add_constant("C2")
    print(C._vocabulary)
开发者ID:Wheatwizard,项目名称:pyVivid,代码行数:19,代码来源:constant_assignment.py
示例8: setUp
def setUp(self):
    # Document sized with 20 — presumably the topic count, matching Model(20);
    # TODO confirm against Document's constructor.
    self.document = Document(20)
    self.vocabulary = Vocabulary()
    self.vocabulary.load("../testdata/vocabulary.dat")
    # LDA model with the same 20-topic configuration.
    self.model = Model(20)
    self.model.load('../testdata/lda_model')
    # Tokens chosen to cover all membership cases:
    self.doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                       'mac os x', 'chrome',  # only exist in vocabulary
                       'nokia', 'null']  # inexistent
开发者ID:JackieXie168,项目名称:mltk,代码行数:11,代码来源:document_test.py
示例9: MaxProbSegmenterTest
class MaxProbSegmenterTest(unittest.TestCase):
    """End-to-end check of MaxProbSegmenter over a sample document file."""

    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../data/vocabulary.dat')
        self.hmm_segmenter = HMMSegmenter()
        self.hmm_segmenter.load('../data/hmm_segment_model')
        self.max_prob_segmenter = MaxProbSegmenter(
            self.vocabulary, self.hmm_segmenter)

    def call_segment(self, text):
        """Print each segmented word as 'word/<tab>' on one line, then a newline."""
        # print(..., end=' ') reproduces the Python-2-only 'print x,' form,
        # which the original used and which does not parse on Python 3.
        for word in self.max_prob_segmenter.segment(text):
            print(word + '/\t', end=' ')
        print('')

    def test_segment(self):
        # 'with' guarantees the file is closed even if segmentation raises;
        # the original leaked fp on any exception before fp.close().
        with open('testdata/document.dat', 'rb') as fp:
            for text in fp.readlines():
                self.call_segment(text.strip())
开发者ID:fandywang,项目名称:python-wordsegmenter,代码行数:20,代码来源:max_prob_segmenter_test.py
示例10: __init__
def __init__(self, args, src_file, trg_file):
    """Build source/target vocabularies from parallel files and register
    the encoder-decoder parameters.

    args must provide embed_size and hidden_size; src_file/trg_file are the
    corpora the two vocabularies are built from.
    """
    self.src_vocabulary = Vocabulary()
    self.src_vocabulary.make_dictionary(src_file)
    self.trg_vocabulary = Vocabulary()
    self.trg_vocabulary.make_dictionary(trg_file)
    # wtoi: word-to-index mapping — vocabulary size is its length.
    self.src_size = len(self.src_vocabulary.wtoi)
    self.embed_size = args.embed_size
    self.hidden_size = args.hidden_size
    self.trg_size = len(self.trg_vocabulary.wtoi)
    # The hidden_size*4 output widths suggest fused LSTM gate blocks
    # (input/forget/output/cell) — TODO confirm against the framework's
    # LSTM convention.
    super(EncoderDecoder, self).__init__(
        # encoder
        w_xe=F.EmbedID(self.src_size, self.embed_size),
        w_ep=F.Linear(self.embed_size, self.hidden_size*4),
        w_pp=F.Linear(self.hidden_size, self.hidden_size*4),
        # decoder
        w_ey=F.EmbedID(self.trg_size, self.embed_size),
        w_qe=F.Linear(self.embed_size, self.hidden_size*4),
        w_qq=F.Linear(self.hidden_size, self.hidden_size*4),
        w_yq=F.Linear(self.hidden_size, self.trg_size),
    )
开发者ID:lrlab,项目名称:LSTM,代码行数:23,代码来源:seq2seq.py
示例11: generate_dataset
def generate_dataset(items, slots, voca: Vocabulary):
    """Vectorize (sentence, tag-sequence) pairs into a Dataset.

    Each word becomes its vocabulary vector; each tag becomes a one-hot
    float32 vector over *slots*.
    """
    dataset = Dataset()
    for item in items:
        sentence, tag_sequence = item[0], item[1]
        word_vectors = [voca.get(token) for token in sentence.split()]
        one_hot_labels = []
        for tag in tag_sequence.split():
            one_hot = np.zeros([len(slots)], dtype=np.float32)
            one_hot[slots.index(tag)] = 1
            one_hot_labels.append(one_hot)
        dataset.add(sentence, tag_sequence, word_vectors, one_hot_labels)
    return dataset
开发者ID:fin10,项目名称:MachineLearningStudy,代码行数:16,代码来源:slot_tagger.py
示例12: __init__
def __init__(self, documents=None, vocab=None, frozen=None):
    """Initialize a corpus, defaulting to empty/unfrozen state.

    documents: initial Document list; vocab: shared Vocabulary;
    frozen: whether the corpus rejects further additions.
    """
    # 'is None' sentinel checks: the original truthiness tests silently
    # replaced an explicitly passed empty documents list (or frozen=False)
    # with fresh defaults, so a caller could never share an initially
    # empty list with the corpus.
    self.documents = documents if documents is not None else []
    self.vocab = vocab if vocab is not None else Vocabulary()
    self.frozen = frozen if frozen is not None else False
开发者ID:hannawallach,项目名称:cmpsci691bm,代码行数:16,代码来源:corpus.py
示例13: open
def open(self, corpus_dir):
    """Open (creating on demand) the on-disk corpus layout rooted at corpus_dir.

    Loads the vocabulary and categories, then prints the categories.
    """
    self.root_dir = corpus_dir
    # makedirs(exist_ok=True) replaces the racy isdir()-then-mkdir pattern:
    # another process creating the directory between the check and the call
    # made the original raise.
    os.makedirs(self.root_dir, exist_ok=True)
    self.meta_dir = self.root_dir + "/meta"
    self.samples_dir = self.root_dir + "/samples"
    os.makedirs(self.samples_dir, exist_ok=True)
    self.vocabulary_dir = self.root_dir + "/vocabulary"
    self.vocabulary = Vocabulary(self.vocabulary_dir)
    self.categories_dir = self.root_dir + "/categories"
    self.categories = Categories(self.categories_dir)
    self.categories.load_categories()
    self.categories.print_categories()
开发者ID:uukuguy,项目名称:digger,代码行数:18,代码来源:corpus.py
示例14:
def test_antonym_valid_phrase_2(self):
    """antonym('respect') should contain disesteem/disrespect (order-insensitive)."""
    expected_result = json.loads('{"text": ["disesteem", "disrespect"]}')
    current_result = vb.antonym("respect")
    # assertItemsEqual was renamed assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        compare = self.assertItemsEqual
    else:
        compare = self.assertCountEqual
    compare(current_result, expected_result)
开发者ID:Anhmike,项目名称:vocabulary,代码行数:8,代码来源:tests.py
示例15: test_pronunciation_valid_phrase
def test_pronunciation_valid_phrase(self):
    """pronunciation('hippopotamus') should match both AHD and ARPABET entries."""
    result = '[{"rawType": "ahd-legacy", "raw": "(hĭpˌə-pŏtˈə-məs)", "seq": 0}, {"rawType": "arpabet", "raw": "HH IH2 P AH0 P AA1 T AH0 M AH0 S", "seq": 0}]'
    expected_result = json.loads(result)
    current_result = vb.pronunciation("hippopotamus")
    # assertItemsEqual was renamed assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        compare = self.assertItemsEqual
    else:
        compare = self.assertCountEqual
    compare(current_result, expected_result)
开发者ID:Anhmike,项目名称:vocabulary,代码行数:8,代码来源:tests.py
示例16: synonyms
def synonyms(word):
    """Return *word*'s synonyms as a comma-separated line, or "N/A" on any failure."""
    try:
        entries = json.loads(vb.synonym(word))
        # join() builds the comma-separated list directly, replacing the
        # append-then-trim-trailing-comma pattern.
        return ','.join(entry['text'] for entry in entries) + '\n'
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # are no longer swallowed; lookup failures still degrade to "N/A".
        return "N/A"
开发者ID:nikhilkumarsingh,项目名称:vocab_app,代码行数:9,代码来源:vocab.py
示例17: translate
def translate(text):
    """Translate *text* from English to Hindi; returns a comma-joined line
    of candidate translations, or "N/A" on any failure."""
    try:
        pieces = json.loads(vb.translate(text, "en", "hi"))
        # join() replaces the append-then-trim-trailing-comma pattern.
        return ','.join(piece['text'] for piece in pieces) + '\n'
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # are no longer swallowed; API failures still degrade to "N/A".
        return "N/A"
开发者ID:nikhilkumarsingh,项目名称:vocab_app,代码行数:9,代码来源:vocab.py
示例18: test_synonym_valid_phrase
def test_synonym_valid_phrase(self):
    """synonym('repudiate') should yield 'deny' (expected round-tripped through JSON)."""
    result = '[{"seq": 0, "text": "deny"}]'
    expected_result = json.dumps(json.loads(result))
    current_result = vb.synonym("repudiate")
    # assertItemsEqual was renamed assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        compare = self.assertItemsEqual
    else:
        compare = self.assertCountEqual
    compare(current_result, expected_result)
开发者ID:Anhmike,项目名称:vocabulary,代码行数:9,代码来源:tests.py
示例19: test_translate_valid_phrase
def test_translate_valid_phrase(self):
    """translate('hummus', en->es) should return the word unchanged."""
    result = '[{"text": "hummus", "seq": 0}]'
    expected_result = json.dumps(json.loads(result))
    current_result = vb.translate("hummus", "en", "es")
    # assertItemsEqual was renamed assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        compare = self.assertItemsEqual
    else:
        compare = self.assertCountEqual
    compare(current_result, expected_result)
开发者ID:Anhmike,项目名称:vocabulary,代码行数:9,代码来源:tests.py
示例20: test_hyphenation_valid_phrase
def test_hyphenation_valid_phrase(self):
    """hyphenation('hippopotamus') should produce the five stress-annotated syllables."""
    result = '[{"seq": 0, "text": "hip", "type": "secondary stress"}, {"seq": 1, "text": "po"}, {"seq": 2, "text": "pot", "type": "stress"}, {"seq": 3, "text": "a"}, {"seq": 4, "text": "mus"}]'
    expected_result = json.dumps(json.loads(result))
    current_result = vb.hyphenation("hippopotamus")
    # assertItemsEqual was renamed assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        compare = self.assertItemsEqual
    else:
        compare = self.assertCountEqual
    compare(current_result, expected_result)
开发者ID:Anhmike,项目名称:vocabulary,代码行数:9,代码来源:tests.py
注:本文中的vocabulary.Vocabulary类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论