This article collects typical usage examples of the utils.tokenize function in Python. If you have been wondering how exactly to use tokenize, what it is for, or what real calls look like, the hand-picked function examples below may help.
Twenty code examples of the tokenize function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
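Before the examples, here is the calling pattern they all share. Note that utils.tokenize is project-specific: every repository below ships its own implementation in its utils module, and some accept extra keyword arguments such as periods, capitalized, clean or split_alphanum. The stand-in below is only a minimal sketch for illustration, assuming a plain word-splitting tokenizer; it is not the implementation used by any of these projects.

import re

def tokenize(text):
    """Hypothetical stand-in: split text into word tokens.

    Not the tokenize of the projects below; each defines its own version.
    """
    return re.findall(r"[A-Za-z0-9']+", text)

words = tokenize("What is the boiling point of water?")
# ['What', 'is', 'the', 'boiling', 'point', 'of', 'water']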
Example 1: predict_answers
def predict_answers(data, word2vec, N):
    stop = stopwords.words('english')
    pred_answs = []
    pred_probs = [["A", "B", "C", "D"]]
    for i in range(data.shape[0]):
        # calculate word2vec for question
        q_vec = np.zeros(N, dtype=float)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                q_vec = np.add(q_vec, w2)
        q_vec = q_vec / linalg.norm(q_vec)
        # calculate word2vec for answers
        A_vec = np.zeros(N, dtype=float)
        B_vec = np.zeros(N, dtype=float)
        C_vec = np.zeros(N, dtype=float)
        D_vec = np.zeros(N, dtype=float)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                A_vec = np.add(A_vec, w2)
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                B_vec = np.add(B_vec, w2)
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                C_vec = np.add(C_vec, w2)
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                w2 = getword2vecval(N, w.lower(), word2vec)
                D_vec = np.add(D_vec, w2)
        A_vec = A_vec / linalg.norm(A_vec)
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        # choose the answer with the largest cosine similarity to the question
        probs = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec)
        idx = probs.argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
        pred_probs.append(probs)
    return pred_answs, pred_probs
Developer: Sirorezka | Project: a-l-l-e-n-_-m-a-s-t-_-r | Lines: 57 | Source: 0__glove_predict.py
Example 2: get_glove_features
def get_glove_features(data, word2vec, N):
    stop = stopwords.words('english')
    scores = []
    for i in range(data.shape[0]):
        # calculate word2vec for question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
            # # get all synonyms of the word
            # syns = wn.synsets(w.lower(), pos='n')
            # if len(syns) > 0:
            #     for syn in syns:
            #         sw = syn.lemma_names()[0]
            #         if sw.lower() in word2vec and sw.lower() not in stop:
            #             q_vec += word2vec[sw.lower()]
        q_vec = q_vec / linalg.norm(q_vec)
        # calculate word2vec for answers
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                A_vec += word2vec[w.lower()]
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]
        A_vec = A_vec / linalg.norm(A_vec)
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        # score each answer by cosine similarity with the question vector
        scores.append(np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec))
    return scores
Developer: johnkorn | Project: kaggle_allen | Lines: 54 | Source: glove_predict.py
Example 3: __build_dictionary
def __build_dictionary(synset, hyperhypo):
    lesk_dictionary = []
    # Includes definition.
    lesk_dictionary += tokenize(synset.definition)
    # Includes lemma_names.
    lesk_dictionary += synset.lemma_names
    # Optional: includes lemma_names of hypernyms and hyponyms.
    if hyperhypo:
        related_senses = synset.hypernyms() + synset.hyponyms()
        for related_sense in related_senses:
            lesk_dictionary += tokenize(related_sense.definition)
            lesk_dictionary += [lemma.name for lemma in related_sense.lemmas]
    without_stop_words = filter(lambda word: word not in english_stopwords, lesk_dictionary)
    return map(lambda word: word.lower(), without_stop_words)
Developer: finiteautomata | Project: wisdom | Lines: 15 | Source: lesk.py
Example 4: generate_citations
def generate_citations(lines, vocab, index):
    word2idx = dict([(v, k) for k, v in enumerate(vocab)])
    for line in lines[:100]:
        tokenized = list()
        capitalized = list()
        for word, cap in zip(utils.tokenize(line, periods=True),
                             utils.tokenize(line, periods=True, capitalized=True)):
            if word == '.':
                if len(tokenized) > 10:
                    citation = generate_citation([word2idx[w] for w in tokenized if w in word2idx], index)
                    print(' '.join(capitalized) + ' (%s).' % citation)
                tokenized = list()
                capitalized = list()
            else:
                tokenized.append(word)
                capitalized.append(cap)
Developer: codekansas | Project: citation-generator | Lines: 15 | Source: execute.py
Example 5: predict_segmented_tf_idf
def predict_segmented_tf_idf(data, docs_per_q, ids_and_categories):
    # index docs
    res = []
    category_tf_idfs = {}
    for index, row in data.iterrows():
        current_id = str(row['id'])
        print current_id
        current_category = ids_and_categories[current_id]
        if category_tf_idfs.get(current_category) is None:
            category_tf_idfs[current_category] = utils.get_docstf_idf(wiki_docs_dir + '/%s' % current_category)
        docs_tf, words_idf = category_tf_idfs[current_category]
        # get answer words
        w_A = set(utils.tokenize(row['answerA']))
        w_B = set(utils.tokenize(row['answerB']))
        w_C = set(utils.tokenize(row['answerC']))
        w_D = set(utils.tokenize(row['answerD']))
        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0
        q = row['question']
        for d in zip(*utils.get_docs_importance_for_question(q, docs_tf, words_idf, docs_per_q))[0]:
            for w in w_A:
                if w in docs_tf[d]:
                    # term frequency in the document times the word's idf weight
                    sc_A += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_B:
                if w in docs_tf[d]:
                    sc_B += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_C:
                if w in docs_tf[d]:
                    sc_C += 1. * docs_tf[d][w] * words_idf[w]
            for w in w_D:
                if w in docs_tf[d]:
                    sc_D += 1. * docs_tf[d][w] * words_idf[w]
        res.append(['A', 'B', 'C', 'D'][np.argmax([sc_A, sc_B, sc_C, sc_D])])
    return res
Developer: Evanc123 | Project: allen_ai | Lines: 48 | Source: doc2vecpredict.py
Example 6: testTokens
def testTokens(self):
    tokens = utils.tokenize(self.str3)
    self.assertEqual(11, len(tokens))
    self.assertEqual('\n two empty spaces and some escaped chars \\\"\\\' in normal textfollowed by a ', tokens[0]['token'])
    self.assertEqual('"dbl quote"', tokens[1]['token'])
    self.assertEqual(' and then a ', tokens[2]['token'])
    self.assertEqual("'single quote'", tokens[3]['token'])
    self.assertEqual('\nwait there is more!! ', tokens[4]['token'])
    self.assertEqual('"\'signle quotes\' inside a double quote"', tokens[5]['token'])
    self.assertEqual(' and ', tokens[6]['token'])
    self.assertEqual('\'"double quotes" inside a single quote\'', tokens[7]['token'])
    self.assertEqual('\nwait! there\\\'s more!! ', tokens[8]['token'])
    self.assertEqual('"escaped double quotes \\" and escaped single quotes\\\' "', tokens[9]['token'])
    self.assertEqual(' ', tokens[10]['token'])
    self.assertEqual(utils.TOKEN_NORMAL, tokens[0]['type'])
    self.assertEqual(utils.TOKEN_DBL_Q, tokens[1]['type'])
    self.assertEqual(utils.TOKEN_NORMAL, tokens[2]['type'])
    self.assertEqual(utils.TOKEN_SNG_Q, tokens[3]['type'])
    self.assertEqual(utils.TOKEN_NORMAL, tokens[4]['type'])
    self.assertEqual(utils.TOKEN_DBL_Q, tokens[5]['type'])
    self.assertEqual(utils.TOKEN_NORMAL, tokens[6]['type'])
    self.assertEqual(utils.TOKEN_SNG_Q, tokens[7]['type'])
    self.assertEqual(utils.TOKEN_NORMAL, tokens[8]['type'])
    self.assertEqual(utils.TOKEN_DBL_Q, tokens[9]['type'])
    self.assertEqual(utils.TOKEN_NORMAL, tokens[10]['type'])
Developer: engina | Project: jn-cpu | Lines: 25 | Source: TestTokenizer.py
Example 7: FrequentWords
def FrequentWords(data_dirs, suffixes, max_key_words):
    """
    Returns a dictionary of min(max_key_words, percentile_key_words), giving key
    word with its count.
    """
    matches = matchingFiles(data_dirs, suffixes)
    token_count = Counter()
    files_done = 0
    for file_name in matches:
        tokens = tokenize(file_name)
        for token in tokens:
            if len(token) == 0:
                continue
            try:
                token_count[token] += 1
            except:
                token_count[token] = 1
        files_done += 1
        if (files_done % 5000 == 0):
            print("Completed parsing %d files ..." % files_done)
    # num_key_words = min(max_key_words,
    #                     math.ceil(percentile_key_words * len(token_count)))
    return token_count.most_common(max_key_words)
Developer: subhasis256 | Project: ml_code_completion | Lines: 25 | Source: key_word_extractor.py
Example 8: tag
def tag(self, text=None):
    """
    Tags the given text.

    :param text: a string or unicode object. Strings assumed to be utf-8
    :returns: a list of lists (sentences with tokens).
        Each sentence has (token, tag) tuples.
    """
    result = []
    if text:
        tokens = utils.tokenize(text, clean=False)
        for sent in tokens:
            tags = self.tag_tokens(sent)
            result.append(zip(sent, tags))
    else:
        # read tsv from stdin
        sent = []
        for line in sys.stdin:
            line = line.decode('utf-8').strip()
            if line:
                sent.append(line.split()[0])
            else:
                tags = self.tag_tokens(sent)
                result.append(zip(sent, tags))
                sent = []
    return result
Developer: attardi | Project: nlpnet | Lines: 27 | Source: taggers.py
Example 9: bird_info
def bird_info(self):
    birdv = self.machine.run("echo | birdc | head -1").strip().replace(" ready.", "")
    birdv = birdv.split(" ")
    info = {
        "daemon": birdv[0],
        "version": birdv[1],
        "ospf": {}
    }
    log.info("[%s] getting OSPF neighbours" % self.hostname())
    output = self.machine.run("echo show ospf neighbors | birdc | sed '/^bird[^ ] .*/d'")
    neighbours = []
    for toks in [tokenize(l) for l in splitlines(output)[2:]]:
        neighbour = {
            "routerid": toks[0]
        }
        if toks[4][0] in ascii_letters:
            neighbour["ifname"] = toks[4]
            neighbour["v4addr"] = toks[5]
        else:
            neighbour["v4addr"] = toks[4]
            neighbour["ifname"] = toks[5]
        neighbours.append(neighbour)
    info["ospf"]["neighbours"] = neighbours
    return info
Developer: tegola-hubs | Project: dendria | Lines: 25 | Source: rlogin.py
Example 10: matchUp
def matchUp(self, token, ingredientRow):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.

    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    ret = []
    # strip parens from the token, since they often appear in the
    # display_name, but are removed from the comment.
    token = utils.normalizeToken(token)
    decimalToken = self.parseNumbers(token)
    for key, val in ingredientRow.iteritems():
        if isinstance(val, basestring):
            for n, vt in enumerate(utils.tokenize(val)):
                if utils.normalizeToken(vt) == token:
                    ret.append(key.upper())
        elif decimalToken is not None:
            try:
                if val == decimalToken:
                    ret.append(key.upper())
            except:
                pass
    return ret
Developer: NYTimes | Project: ingredient-phrase-tagger | Lines: 35 | Source: cli.py
Example 11: generate_data
def generate_data(self, count, offset):
    """
    Generates training data in the CRF++ format for the ingredient
    tagging task
    """
    df = pd.read_csv(self.opts.data_path)
    df = df.fillna("")
    start = int(offset)
    end = int(offset) + int(count)
    df_slice = df.iloc[start:end]
    for index, row in df_slice.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            del(row["input"])
            rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])
            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i + 1, tokens)
                print utils.joinLine([token] + features + [self.bestTag(tags)])
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass
        print
Developer: NYTimes | Project: ingredient-phrase-tagger | Lines: 31 | Source: cli.py
Example 12: classify_proba
def classify_proba(self, text):
    token_list = tokenize(text)
    token_list = del_stopwords(token_list, self.stopset)
    wordfreq_dict = stat_wordfreq(token_list)
    dictfeats = tfidf(wordfreq_dict, self.idf_dict)
    vecfeats = self.vectorizer.transform(dictfeats).toarray()
    prob = self.classifier.predict_proba(vecfeats)
    return prob[0]
Developer: Lonesome-George | Project: nlp_project1 | Lines: 8 | Source: jc_model.py
Example 13: macaddr
def macaddr(self, iface):
    output = self.machine.run("ip link show dev %s | grep link/ether" % iface).strip()
    if not output:
        return None
    mac = tokenize(output)[1].upper()
    if len(mac.replace("0", "").replace(":", "")) == 0:
        return None
    return mac
Developer: tegola-hubs | Project: dendria | Lines: 8 | Source: rlogin.py
Example 14: find_similar_articles
def find_similar_articles(corpus_name, method, content, data_dir=os.getcwd(), index=None):
    """
    - corpus_name: name of the corpus to work on (a .tsv file,
      given without the .tsv extension)
    - method: ldan (n = number of topics), lsin or tfidf
    - content: a text

    Returns the 5 articles of corpus_name closest to the given content.
    """
    corpus_file = os.path.join(data_dir, corpus_name + '_' + method + '.mm')
    index_file = os.path.join(data_dir, corpus_name + '_' + method + '_index')
    docid_file = os.path.join(data_dir, corpus_name + '_docid.txt')
    # Load the corpus
    try:
        corpus = corpora.mmcorpus.MmCorpus(corpus_file)
    except Exception:
        raise IOError('Impossible de charger le fichier %s. Avez-vous bien appliqué le script corpus_to_matrix.py ?' % (corpus_file))
    # Load the index file if it was not passed as an argument
    if not index:
        try:
            index = similarities.docsim.Similarity.load(index_file)
        except Exception:
            raise IOError("""Impossible de charger le fichier %s. Avez-vous bien appliqué le script %s avec l'option --saveindex ?""" % (method, index_file))
    dico_file = os.path.join(data_dir, corpus_name + '_wordids.txt')
    # Load the dictionary
    try:
        id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
    except Exception:
        raise IOError("Impossible de charger le fichier %s" % (dico_file))
    # Load the model matching the method requested by the user
    if method == 'tfidf':
        model_file = os.path.join(data_dir, corpus_name + '_tfidf_model')
        model = models.tfidfmodel.TfidfModel.load(model_file)
    elif method.startswith('lsi'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.lsimodel.LsiModel.load(model_file)
    elif method.startswith('lda'):
        model_file = os.path.join(data_dir, corpus_name + '_' + method + '_model')
        model = models.ldamodel.LdaModel.load(model_file)
    tokens = model[id2word.doc2bow(utils.tokenize(content))]
    # Return the 5 closest articles
    sims = index[tokens]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return json.dumps([{'id': utils.get_article_by_corpus_number(x[0], docid_file), 'score': round(x[1], 2)}
                       for x in sims[:5]])
Developer: fchantrel | Project: habeascorpus | Lines: 58 | Source: similar_articles.py
Example 15: word_freq
def word_freq(filenames, stopset):
    wordset = set()           # set of all words seen
    freqset_list = [[], []]   # word frequencies of negative and positive texts, respectively
    npos = 0                  # number of positive texts so far
    nneg = 0                  # number of negative texts so far
    icur = 0                  # index of the current positive or negative text
    for filename in filenames:
        fr = file(filename, 'r')
        while True:
            line = fr.readline().decode("utf-8")
            if len(line) == 0:  # zero length indicates EOF
                break
            id, label, text = proc_line(line)
            token_list = tokenize(text)
            token_list = del_stopwords(token_list, stopset)
            wordfreq_dict = {}
            for token in token_list:
                wordset.add(token)  # add the word to the global word set
                if wordfreq_dict.has_key(token):
                    wordfreq_dict[token] += 1
                else:
                    wordfreq_dict[token] = 1
            doc = [id, label, wordfreq_dict]  # record each text's id, label and word frequencies
            # append the text to the corresponding list
            index = 0
            if label == '1':
                index = 1
                freqset_list[1].append(doc)
                icur = npos
                npos += 1
            elif label == '-1':
                index = 0
                freqset_list[0].append(doc)
                icur = nneg
                nneg += 1
            else:
                print 'tag-unknown text'
                continue
        fr.close()
    # save the feature words to a file
    f = open('./Training/WordSet.txt', 'w')
    for word in wordset:
        string = word + '\n'
        f.write(string.encode("utf-8"))
    f.close()
    # save the raw word frequencies to a file
    f = open('./Training/WordFreq_Orig.txt', 'w')
    for i in range(2):
        for freqset in freqset_list[i]:
            id = freqset[0]
            label = freqset[1]
            freq_list = freqset[2]
            string = id + '\t' + label + '\t'
            for word in freq_list:
                string += word + ',' + str(freq_list[word]) + ';'
            string += '\n'
            f.write(string.encode('utf-8'))
    return wordset, freqset_list
Developer: Lonesome-George | Project: nlp_project1 | Lines: 58 | Source: extract_features.py
Example 16: v4addr
def v4addr(self, iface):
    output = self.machine.run("ip addr show dev %s | grep '^ *inet '" % iface).strip()

    def parseaddr(a):
        a = a.strip()
        if "/" not in a:
            return a + "/32"
        return a

    tokset = [tokenize(l) for l in splitlines(output)]
    return [parseaddr(toks[1]) for toks in tokset if len(toks) > 0]
Developer: tegola-hubs | Project: dendria | Lines: 9 | Source: rlogin.py
Example 17: dict_from_file
def dict_from_file(filename, match_case=True):
    d = defaultdict(list)
    with codecs.open(DICTS_DIR + filename, 'rb', encoding='utf8') as f:
        for line in f:
            tokens = tokenize(normalize(line, lowercase=(not match_case)),
                              split_alphanum=split_alphanum)
            for (nb, token) in enumerate(tokens):
                d[token] += [(tokens, nb)]
    return (d, match_case)
Developer: donvel | Project: affiliations | Lines: 9 | Source: export.py
Example 18: find_word_freq
def find_word_freq(li):
    all_tokens = [normalize(t, lowercase=False)
                  for aff in li
                  for t in tokenize(text_in_element(aff),
                                    split_alphanum=split_alphanum)]
    freq = defaultdict(int)
    for token in all_tokens:
        freq[token] += 1
    return freq
Developer: donvel | Project: affiliations | Lines: 9 | Source: export.py
Example 19: predict_answers
def predict_answers(data, word2vec, N):
    stop = stopwords.words('english')
    pred_answs = []
    for i in range(data.shape[0]):
        # calculate word2vec for question
        q_vec = np.zeros(N)
        for w in tokenize(data['question'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                q_vec += word2vec[w.lower()]
        q_vec = q_vec / linalg.norm(q_vec)
        # calculate word2vec for answers
        A_vec = np.zeros(N)
        B_vec = np.zeros(N)
        C_vec = np.zeros(N)
        D_vec = np.zeros(N)
        for w in tokenize(data['answerA'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                A_vec += word2vec[w.lower()]
        for w in tokenize(data['answerB'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                B_vec += word2vec[w.lower()]
        for w in tokenize(data['answerC'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                C_vec += word2vec[w.lower()]
        for w in tokenize(data['answerD'][i]):
            if w.lower() in word2vec and w.lower() not in stop:
                D_vec += word2vec[w.lower()]
        A_vec = A_vec / linalg.norm(A_vec)
        B_vec = B_vec / linalg.norm(B_vec)
        C_vec = C_vec / linalg.norm(C_vec)
        D_vec = D_vec / linalg.norm(D_vec)
        # choose the answer with the highest cosine similarity to the question
        idx = np.concatenate((A_vec, B_vec, C_vec, D_vec)).reshape(4, N).dot(q_vec).argmax()
        pred_answs.append(["A", "B", "C", "D"][idx])
    return pred_answs
Developer: 5vision | Project: kaggle_allen | Lines: 44 | Source: glove_predict.py
Example 20: build_vocab
def build_vocab(docs, save_as):
    start = time.time()
    vocab = set()
    for file in utils.iterate_corpus(docs):
        with open(file, 'r') as f:
            tokenized = itertools.chain.from_iterable(utils.tokenize(line) for line in f.readlines())
            vocab.update(tokenized)
    vocab = list(vocab)
    pkl.dump(vocab, open(save_as, 'wb'))
    print('Built vocabulary and saved it to "%s" in %s'
          % (save_as, utils.strtime(time.time() - start)), file=sys.stderr)
    return vocab
Developer: codekansas | Project: citation-generator | Lines: 11 | Source: build.py
Note: The utils.tokenize function examples in this article were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs and similar platforms. The snippets were selected from open-source projects contributed by their respective developers, and copyright in the source code remains with the original authors. Please consult each project's license before distributing or reusing the code; do not republish without permission.