This article collects typical usage examples of the tokenizer.tokenize function in Python, drawn from real open-source projects. If you have been wondering what tokenize does in practice and how to call it, the curated examples here should help.
Below are 20 code examples of the tokenize function, sorted by popularity by default.
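Before the examples, here is the call pattern they all share: a project-local tokenizer module exposes a tokenize function that takes a piece of text (and, in some projects, extra arguments such as a stemming flag or regex patterns) and returns a sequence of tokens. The regex-based stand-in below is only an illustrative sketch for readers who want something runnable on its own; it is not the implementation used by any of the projects quoted later, each of which ships its own tokenizer with its own signature.

import re

def tokenize(text):
    # Illustrative stand-in only: lowercase word tokens split on non-word characters.
    # The real tokenize functions in the examples below differ per project
    # (stemming options, file-object input, source-code tokenization, etc.).
    return [t.lower() for t in re.findall(r"\w+", text, re.UNICODE)]

print(tokenize("Hello, tokenizer world!"))  # ['hello', 'tokenizer', 'world']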
Example 1: normalize_data
def normalize_data(stem = True):
    global contexts
    for word in words:
        # converting each sense-definition pair to sense-normalized_definition_tokens
        words[word] = map(lambda pair: [pair[0], tokenize(pair[1], stem)], words[word])
    # Normalizing contexts as well similarly
    contexts = map(lambda triple: [triple[0], triple[1], tokenize(triple[2], stem)], contexts)
Author: mmbrian, Project: snlp_ss15, Lines: 7, Source file: wsd.py
Example 2: _clean
def _clean(self, source, target, source_cleaned, target_cleaned, m):
    self.info('Cleaning...')
    source_in = codecs.open(source, 'rb', 'utf-8')
    target_in = codecs.open(target, 'rb', 'utf-8')
    source_out = codecs.open(source_cleaned, 'wb', 'utf-8')
    target_out = codecs.open(target_cleaned, 'wb', 'utf-8')
    for num_lines, _ in enumerate(source_in): pass
    source_in.seek(0, 0)
    pbar = ProgressBar(maxval=num_lines).start()
    for l in count(0):
        source_line = source_in.readline()
        target_line = target_in.readline()
        if not source_line or not target_line:
            break
        source_tokens = tokenize(source_line)
        target_tokens = tokenize(target_line)
        if len(source_tokens) == 0 or len(source_tokens) > m \
                or len(target_tokens) == 0 or len(target_tokens) > m:
            continue
        source_out.write(' '.join(source_tokens) + '\n')
        target_out.write(' '.join(target_tokens) + '\n')
        pbar.update(l)
    pbar.finish()
    source_in.close()
    target_in.close()
    source_out.close()
    target_out.close()
Author: amitshaked, Project: translate, Lines: 34, Source file: phrase_table.py
Example 3: get_msr_feats
def get_msr_feats(corpus):
    feats1 = []
    feats2 = []
    for sample in corpus:
        words1 = [word.lower() for word in tokenize(sample[1])]
        words2 = [word.lower() for word in tokenize(sample[2])]
        feats1.append([words1])
        feats2.append([words2])
    return feats1, feats2
Author: hiroki13, Project: neural-sentence-matching-system, Lines: 9, Source file: feat_factory.py
Example 4: get_tokens
def get_tokens(obj):
    if isinstance(obj, basestring):
        return tokenize(obj)
    elif isinstance(obj, file):
        return tokenize(obj)
    else:
        # object not valid
        raise TypeError('Got unexpected object type {0!r}'.format(
            obj.__class__.__name__))
Author: pradyunsg, Project: PyTML, Lines: 9, Source file: main.py
Example 5: parse_paragraph
def parse_paragraph(parag, mim_tags, fast_p):
    """ Parse a single paragraph in free text form and compare to MIM POS tags """
    tokens = tokenize(parag)
    tlist = list(tokens)
    result = parse_tokens(tlist, mim_tags, fast_p)
    print("{0}\n--> {1} sentences, {2} parsed".format(parag, result["num_sent"], result["num_parsed_sent"]))
Author: halldor, Project: Reynir, Lines: 7, Source file: mim.py
Example 6: test_simple_file
def test_simple_file(self):
    input = """#include GLFW_INCLUDE_GLU
#include <GLFW/glfw3.h>
#include <cstdio>
/* Random function */
static void glfw_key_callback(int key, int scancode, int action, int mod){
if(glfw_key_callback){
// Comment here
input_event_queue->push(inputaction);
}
}"""
    (final_stats, final_tokens, file_times) = tokenizer.tokenize(input, comment_inline_pattern, comment_open_close_pattern, separators)
    (file_hash,lines,LOC,SLOC) = final_stats
    (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens
    self.assertEqual(lines,11)
    self.assertEqual(LOC,10)
    self.assertEqual(SLOC,8)
    self.assertEqual(tokens_count_total,24)
    self.assertEqual(tokens_count_unique,18)
    self.assert_common_properties(tokens)
    hard_tokens = set(['include@@::@@3','GLFW_INCLUDE_GLU@@::@@1','GLFW@@::@@1','glfw3@@::@@1','h@@::@@1','cstdio@@::@@1','static@@::@@1','void@@::@@1','glfw_key_callback@@::@@2','int@@::@@4','key@@::@@1','scancode@@::@@1','action@@::@@1','mod@@::@@1','if@@::@@1','input_event_queue@@::@@1','push@@::@@1','inputaction@@::@@1'])
    this_tokens = set(tokens[3:].split(','))
    self.assertEqual(len(hard_tokens - this_tokens), 0)
    m = hashlib.md5()
    m.update(tokens[3:])
    self.assertEqual(m.hexdigest(),token_hash)
Author: Mondego, Project: SourcererCC, Lines: 31, Source file: tokenizer-unit-test.py
Example 7: query
def query(query, offset, rpp):
    # Load the indexed data
    ids = pickle.load(open(config.data_directory + '/monuments.ids', 'r'))
    dictionary = corpora.Dictionary.load(config.data_directory + '/monuments.dict')
    corpus = corpora.MmCorpus(config.data_directory + '/monuments.mm')
    lsi = models.LsiModel.load(config.data_directory + '/monuments.lsi')
    tfidf = models.TfidfModel.load(config.data_directory + '/monuments.tfidf')
    tfidfIndex = similarities.Similarity.load(config.data_directory + '/monuments.tfidf.index')
    lsiIndex = similarities.Similarity.load(config.data_directory + '/monuments.lsi.index')
    # Convert query to a tokenized document and project it as a vector in tfidf and lsi
    tokenized = tokenizer.tokenize(query)
    vector = dictionary.doc2bow(tokenized)
    tfidf_vector = tfidf[vector]
    lsi_vector = lsi[vector]
    # Determine how similar the query vector is to the other documents in
    # the same spaces (tfidf and lsi), and select the most similar documents
    tfidf_similarity = tfidfIndex[tfidf_vector]
    lsi_similarity = lsiIndex[lsi_vector]
    similarity = np.array(lsi_similarity) * np.array(tfidf_similarity)
    similarity = sorted(enumerate(similarity), key=lambda item: -item[1])
    sims = similarity
    sims = [s for s in sims if s[1] > 0]
    offset = int(min(offset, len(sims)))
    results = [str(ids[sim[0]]) for sim in sims[offset:int(min(offset+rpp, len(sims)))]]
    # Print json result
    print json.dumps({
        'nrOfResults': len(sims),
        'startResult': offset,
        'endResult': min(offset+rpp, len(sims)),
        'results': results})
Author: rjagerman, Project: Monubit, Lines: 34, Source file: query.py
Example 8: build_lattice
def build_lattice(self, pt, sentence):
    '''
    Gets a phrase table and the tokenized sentence and outputs a lattice
    file formatted as follows:
    whole sentence
    1-1:
    <English translation> <Translation score>
    <English translation> <Translation score>
    ...
    1-2:
    <English translation> <Translation score>
    <English translation> <Translation score>
    ...
    2-2:
    The spans n-n refer to the tokens of the input Spanish sentence
    '''
    sentence = tokenize(sentence)
    self.sentence = sentence
    for start in xrange(len(sentence)):
        self.phrases[start] = {}
        for end in xrange(start+1, len(sentence)+1):
            foreign = sentence[start:end]
            p = Phrase(foreign, start, end)
            if len(foreign) == 1 and foreign[0] == ',':
                p.translations = [Translation(foreign, (',',), 0)]
            else:
                p.translations = pt.translate(foreign)
            self.phrases[start][end] = p
Author: amitshaked, Project: translate, Lines: 29, Source file: translation_lattice.py
Example 9: extractCoordinates
def extractCoordinates(self):
    self.inputfile = open(self.ifilename, "r")
    line = self.inputfile.readline()
    coords_times_list = []
    i = 0
    while len(line) > 0:
        i = i + 1
        #print i
        try:
            tweet = jsonpickle.decode(line)
        except ValueError, e:
            print repr(e)
            line = self.inputfile.readline()
            continue
        if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
            print "unimplemented data item"
        else:
            #print tweet["text"]
            text = tweet["text"]
            tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
            tokens = tokenizer.tokenize(text)
            if tweet.has_key("coordinates"):
                coord = tweet["coordinates"]
                if coord == None:
                    print "coordinates null"
                elif coord.has_key("type") and coord["type"] == "Point":
                    coords_times_list.append([coord["coordinates"], tweet_w])
                else:
                    print "not a point"
        line = self.inputfile.readline()
Author: cgl, Project: sna-tools, Lines: 30, Source file: stats.py
Example 10: indexDir
def indexDir(dirname):
    basename = os.path.basename(dirname.rstrip("/"))
    indexFile = open('./indexes/%s_index' % basename, 'w');
    idMap = {}
    indexDict = {}
    docId = 0
    for (root, dirnames, filenames) in os.walk(dirname):
        for filename in filenames:
            if (re.search("\.sw[op]$", filename) == None):
                with open(os.path.join(root, filename), 'r') as fh:
                    idMap[docId] = filename
                    tokens = tokenize(fh)
                    for (pos, token) in tokens:
                        token = stem(alias(token))
                        try:
                            positionMap = indexDict[token]
                            try:
                                positionMap[docId].append(pos)
                            except KeyError:
                                positionMap[docId] = [pos]
                        except KeyError:
                            indexDict[token] = {docId: [pos]}
                docId += 1
    fullIndex = {"id_map" : idMap, "index" : indexDict}
    indexFile.write(json.dumps(fullIndex))
    indexFile.close()
Author: fiono, Project: spotless_mind, Lines: 30, Source file: indexer.py
Example 11: __init__
def __init__(self, source):
    self.numTemps = 0
    self.macros = []
    self.mlMacros = []
    for mem in dir(self):
        mem = getattr(self, mem)
        if isinstance(mem, type) and issubclass(mem, Macro):
            if issubclass(mem, MLMacro):
                self.mlMacros.append(mem(self))
            else:
                self.macros.append(mem(self))
    self.macros.sort()
    self.mlMacros.sort()
    tokens = tokenizer.tokenize(source)
    pprint.pprint(tokens)
    code = self.compile(tokens)
    pprint.pprint(code)
    code = Module(
        None,
        Stmt(code)
    )
    set_filename('<macropy>', code)
    self.compiled = ModuleCodeGenerator(code).getCode()
Author: GrimDerp, Project: metapy, Lines: 27, Source file: mcompiler.py
Example 12: test_lt_gt
def test_lt_gt(self):
    test_str = "(and (< 3 5) true)"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
Author: fridgei, Project: CS480, Lines: 7, Source file: unit_tests.py
Example 13: test_math1
def test_math1(self):
    test_str = "(+ (- 234.3 1.1) 23)"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
Author: fridgei, Project: CS480, Lines: 7, Source file: unit_tests.py
Example 14: getScore
def getScore(newtitle):
    query = tokenizer.tokenize(newtitle)
    res = idx.queryVector(query, 1)
    #print("{0} results.".format(len(res)))
    # Take average of (upvotes-downvotes) weighted by similarity score ^ 2
    # but only for posts with simscore > max(simscore)/2
    totalweight = 0.0
    totalscore = 0.0
    for n in res:
        simscore = n[1]
        simscore = simscore ** 0.5
        #if simscore < 0.75:
        #    continue
        post = postdata.posts[n[0]]
        #score = post["day"][1] - post["day"][2] # ups - downs^2
        #score = post["day"][1]
        #score = post["day"][1] + post["num_comments"]
        score = post["day"][1] - post["day"][2] + post["num_comments"]*2.5
        totalscore += float(score) * simscore
        totalweight += simscore
    #return float(score) # test
    if totalweight == 0:
        return 0.0 # couldn't make a score for this
    finalscore = (totalscore / totalweight)
    return finalscore
Author: keitase, Project: EECS498-term-project, Lines: 32, Source file: query.py
Example 15: test_math2
def test_math2(self):
    test_str = "(* 34 (- 23 45))"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
Author: fridgei, Project: CS480, Lines: 7, Source file: unit_tests.py
Example 16: test_while
def test_while(self):
    test_str = "(while (< 3 i) (assign i (+ i 1)))"
    expected_tokens = [Token("(", "L_PAREN"),
                       Token("while", "E_WHILE"),
                       Token("(", "L_PAREN"),
                       Token("<", "O_LT"),
                       Token("3", "V_INT"),
                       Token("i", "V_STRING"),
                       Token(")", "R_PAREN"),
                       Token("(", "L_PAREN"),
                       Token("assign", "E_ASSIGN"),
                       Token("i", "V_STRING"),
                       Token("(", "L_PAREN"),
                       Token("+", "O_ADD"),
                       Token("i", "V_STRING"),
                       Token("1", "V_INT"),
                       Token(")", "R_PAREN"),
                       Token(")", "R_PAREN"),
                       Token(")", "R_PAREN")]
    actual_tokens = tokenize(test_str)
    for actual, expected in izip(actual_tokens, expected_tokens):
        self.assertEqual(actual, expected)
    consumed, remaining = S(actual_tokens)
    self.assertEqual(remaining, [])
    if not remaining:
        print "accepted"
        print consumed
Author: fridgei, Project: CS480, Lines: 27, Source file: tests.py
Example 17: guess
def guess(self, text):
    doc_counts = {}
    doc_inverse_counts = {}
    tokens = tokenize(text)
    scores = {}
    for label in self.labels:
        doc_counts[label] = self.doc_count(label)
        doc_inverse_counts[label] = self.doc_inverse_count(label)
    total = self.total_doc_count()
    for label in self.labels:
        logSum = 0.0
        for word in tokens:
            stem_total_count = self.stem_total_count(word)
            if stem_total_count == 0.0:
                continue
            else:
                word_prob = self.stem_label_count(label, word) / doc_counts[label]
                word_inverse_prob = self.stem_inverse_label_count(label, word) / doc_inverse_counts[label]
                wordicity = word_prob / (word_prob + word_inverse_prob)
                wordicity = (( 1.0 * 0.5) + (stem_total_count * wordicity) ) / (1.0 + stem_total_count )
                if wordicity == 0.0:
                    wordicity = 0.01
                elif wordicity == 1:
                    wordicity = 0.99
                try:
                    logSum += math.log(1.0 - wordicity) - math.log(wordicity)
                except ValueError:
                    print "ValueError"
        try:
            scores[label] = 1.0 / (1.0 + math.exp(logSum))
        except OverflowError:
            print "OverflowError"
    return scores
Author: Radahika, Project: Persimmon, Lines: 34, Source file: train.py
Example 18: test
def test(self, path):
    corp = Corpus(path)
    bs = Bayesian()
    count = 0
    sender_bl = load_pickle('sender_bl.pickle')
    # scan email and define if msg is SPAM or HAM
    # first check if sender occurs in sender Blacklist
    # then count spamicity of the word using the Bayes approach
    for fname, body in corp.emails():
        sender = find_sender(body)
        if sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
            continue
        spamicity_list = []
        count += 1
        tokens = tokenize(body)
        # compute spamicity for each word and create list of the values
        for el in tokens:
            word_spamicity = [el, bs.word_spamicity(el)]
            spamicity_list.append(word_spamicity)
        # prepare list for Bayes
        spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
        spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
        prediction = bs.bayes_pred(spamicity_list[:15])  # Consider only 15 'words'
        if prediction > 0.9 or sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
        else:
            self.tag_it(path, fname, 'OK')
Author: unacau, Project: bayesian-spam-filtering, Lines: 29, Source file: filter.py
Example 19: compute_ave_words_in_sentence
def compute_ave_words_in_sentence(self):
    sentences = tokenizer.split_sentence(self.text)
    average = 0
    for sentence in sentences:
        average += len(tokenizer.tokenize(sentence))
    self.ave_words_in_sentence = 1.0 * average / len(sentences)
    return self.ave_words_in_sentence
Author: haknsahn, Project: authorRecognizer, Lines: 7, Source file: document.py
Example 20: _parse
def _parse(self, path, content, addWords):
    words = tokenizer.tokenize(path, content)
    wordList = []
    currNode = ParseNode(path, 0, None)
    currLine = [0, currNode]
    nodeId = 1
    for token, start, type in words:
        if type == tokenizer.NOTHING:
            if addWords:
                self.words.add(token)
            wordList.append((token, start, currLine))
        elif type == tokenizer.NEWLINE:
            wordList.append(('\\n', start, currLine))
            prevLine = currLine
            currLine = [currLine[0]+1, currNode]
        elif type == tokenizer.DEDENT:
            wordList.append(('\\d', start, currLine))
            currNode = currNode.parent
            currLine[1] = currNode
        elif type == tokenizer.INDENT:
            wordList.append(('\\i', start, currLine))
            currNode = ParseNode(path, nodeId, currNode)
            nodeId += 1
            prevLine[1] = currNode
            currLine[1] = currNode
    if len(wordList) == 0:
        wordList.append(('\\n', 0, currLine))
    return wordList
Author: pokey, Project: smartAutocomplete, Lines: 28, Source file: Classifier.py
Note: The tokenizer.tokenize examples in this article were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs and similar platforms. The snippets are taken from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before using or redistributing the code. Do not reproduce this page without permission.