
Python tokenizer.tokenize Function Code Examples


This article collects typical usage examples of the tokenizer.tokenize function in Python. If you are wondering how to call tokenize, what it is used for, or what working examples look like, the curated code samples below should help.



Twenty code examples of the tokenize function are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
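
Note that the tokenizer referred to here is not the standard-library tokenize module: each project below ships its own tokenizer module, so the exact signature of tokenize varies (plain strings, file objects, source code plus separator tables, or path/content pairs). Purely as orientation, the following is a minimal sketch of the kind of text tokenizer several of the examples assume; the stem flag mirrors the call in Example 1, and the helper itself is hypothetical rather than taken from any of these projects.

import re

# Minimal illustrative tokenizer (hypothetical; the projects below each define
# their own tokenizer module, and their tokenize() signatures differ).
def tokenize(text, stem=False):
    # Lowercase the input and split on non-word characters.
    tokens = re.findall(r"\w+", text.lower())
    if stem:
        # Naive stemming placeholder: drop a trailing "s" from longer words.
        tokens = [t[:-1] if t.endswith("s") and len(t) > 3 else t for t in tokens]
    return tokens

print(tokenize("The quick brown foxes jumped over the lazy dogs.", stem=True))
# -> ['the', 'quick', 'brown', 'foxe', 'jumped', 'over', 'the', 'lazy', 'dog']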

Example 1: normalize_data

def normalize_data(stem = True):
	global contexts
	for word in words:
		# converting each sense-definition pair to sense-normalized_definition_tokens
		words[word] = map(lambda pair: [pair[0], tokenize(pair[1], stem)], words[word])
	# Normalizing contexts as well similarly
	contexts = map(lambda triple: [triple[0], triple[1], tokenize(triple[2], stem)], contexts)
Author: mmbrian, Project: snlp_ss15, Lines: 7, Source: wsd.py


Example 2: _clean

    def _clean(self, source, target, source_cleaned, target_cleaned, m):
        self.info('Cleaning...')
        source_in = codecs.open(source, 'rb', 'utf-8')
        target_in = codecs.open(target, 'rb', 'utf-8')
        source_out = codecs.open(source_cleaned, 'wb', 'utf-8')
        target_out = codecs.open(target_cleaned, 'wb', 'utf-8')

        for num_lines, _ in enumerate(source_in): pass
        source_in.seek(0, 0)

        pbar = ProgressBar(maxval=num_lines).start()
        for l in count(0):
            source_line = source_in.readline()
            target_line = target_in.readline()

            if not source_line or not target_line:
                break

            source_tokens = tokenize(source_line)
            target_tokens = tokenize(target_line)

            if len(source_tokens) == 0 or len(source_tokens) > m \
                    or len(target_tokens) == 0 or len(target_tokens) > m:
                continue

            source_out.write(' '.join(source_tokens) + '\n')
            target_out.write(' '.join(target_tokens) + '\n')
            pbar.update(l)
        pbar.finish()

        source_in.close()
        target_in.close()
        source_out.close()
        target_out.close()
Author: amitshaked, Project: translate, Lines: 34, Source: phrase_table.py


Example 3: get_msr_feats

def get_msr_feats(corpus):
    feats1 = []
    feats2 = []
    for sample in corpus:
        words1 = [word.lower() for word in tokenize(sample[1])]
        words2 = [word.lower() for word in tokenize(sample[2])]
        feats1.append([words1])
        feats2.append([words2])
    return feats1, feats2
Author: hiroki13, Project: neural-sentence-matching-system, Lines: 9, Source: feat_factory.py


Example 4: get_tokens

def get_tokens(obj):
    if isinstance(obj, basestring):
        return tokenize(obj)
    elif isinstance(obj, file):
        return tokenize(obj)
    else:
        # object not valid
        raise TypeError('Got unexpected object type {0!r}'.format(
            obj.__class__.__name__))
Author: pradyunsg, Project: PyTML, Lines: 9, Source: main.py


Example 5: parse_paragraph

def parse_paragraph(parag, mim_tags, fast_p):
    """ Parse a single paragraph in free text form and compare to MIM POS tags """

    tokens = tokenize(parag)
    tlist = list(tokens)
    result = parse_tokens(tlist, mim_tags, fast_p)
    print("{0}\n--> {1} sentences, {2} parsed".format(parag, result["num_sent"], result["num_parsed_sent"]))
Author: halldor, Project: Reynir, Lines: 7, Source: mim.py


Example 6: test_simple_file

    def test_simple_file(self):
        input = """#include GLFW_INCLUDE_GLU
                   #include <GLFW/glfw3.h>
                   #include <cstdio>
                   
                   /* Random function */
                   static void glfw_key_callback(int key, int scancode, int action, int mod){
                     if(glfw_key_callback){
                       // Comment here
                       input_event_queue->push(inputaction);   
                     }
                   }"""
        (final_stats, final_tokens, file_times) = tokenizer.tokenize(input, comment_inline_pattern, comment_open_close_pattern, separators)
        (file_hash,lines,LOC,SLOC) = final_stats
        (tokens_count_total,tokens_count_unique,token_hash,tokens) = final_tokens

        self.assertEqual(lines,11)
        self.assertEqual(LOC,10)
        self.assertEqual(SLOC,8)

        self.assertEqual(tokens_count_total,24)
        self.assertEqual(tokens_count_unique,18)
        self.assert_common_properties(tokens)

        hard_tokens = set(['[email protected]@::@@4','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@2','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@1','[email protected]@::@@3'])
        this_tokens = set(tokens[3:].split(','))
        self.assertEqual(len(hard_tokens - this_tokens), 0)

        m = hashlib.md5()
        m.update(tokens[3:])
        self.assertEqual(m.hexdigest(),token_hash)
Author: Mondego, Project: SourcererCC, Lines: 31, Source: tokenizer-unit-test.py


Example 7: query

def query(query, offset, rpp):

    # Load the indexed data
    ids = pickle.load(open(config.data_directory + '/monuments.ids', 'r'))
    dictionary = corpora.Dictionary.load(config.data_directory + '/monuments.dict')
    corpus = corpora.MmCorpus(config.data_directory + '/monuments.mm') 
    lsi = models.LsiModel.load(config.data_directory + '/monuments.lsi')
    tfidf = models.TfidfModel.load(config.data_directory + '/monuments.tfidf')
    tfidfIndex = similarities.Similarity.load(config.data_directory + '/monuments.tfidf.index')
    lsiIndex = similarities.Similarity.load(config.data_directory + '/monuments.lsi.index')

    # Convert query to a tokenized document and project it as a vector in tfidf and lsi
    tokenized = tokenizer.tokenize(query)
    vector = dictionary.doc2bow(tokenized)
    tfidf_vector = tfidf[vector]
    lsi_vector = lsi[vector]
    
    # Determine how similar the query vector is to the other documents in
    # the same spaces (tfidf and lsi), and select the most similar documents
    tfidf_similarity = tfidfIndex[tfidf_vector]
    lsi_similarity = lsiIndex[lsi_vector]
    similarity = np.array(lsi_similarity) * np.array(tfidf_similarity)
    similarity = sorted(enumerate(similarity), key=lambda item: -item[1])
    sims = similarity
    sims = [s for s in sims if s[1] > 0]
    offset = int(min(offset, len(sims)))
    results = [str(ids[sim[0]]) for sim in sims[offset:int(min(offset+rpp, len(sims)))]]
    
    # Print json result
    print json.dumps({
        'nrOfResults': len(sims),
        'startResult': offset,
        'endResult': min(offset+rpp, len(sims)),
        'results': results})
Author: rjagerman, Project: Monubit, Lines: 34, Source: query.py


Example 8: build_lattice

    def build_lattice(self, pt, sentence):
        '''
        Gets a phrase table and the tokenized sentence and outputs a lattice
        file formatted as follows:
            whole sentence
            1-1:
            <English translation> <Translation score>
            <English translation> <Translation score>
            ...
            1-2:
            <English translation> <Translation score>
            <English translation> <Translation score>
            ...
            2-2:

        The spans n-n refer to the tokens of the input Spanish sentence
        '''
        sentence = tokenize(sentence)
        self.sentence = sentence
        for start in xrange(len(sentence)):
            self.phrases[start] = {}
            for end in xrange(start+1, len(sentence)+1):
                foreign = sentence[start:end]
                p = Phrase(foreign, start, end)
                if len(foreign) == 1 and foreign[0] == ',':
                    p.translations = [Translation(foreign, (',',), 0)]
                else:
                    p.translations = pt.translate(foreign)
                self.phrases[start][end] = p
Author: amitshaked, Project: translate, Lines: 29, Source: translation_lattice.py


Example 9: extractCoordinates

 def extractCoordinates(self):
     self.inputfile = open(self.ifilename, "r") 
     line = self.inputfile.readline()
     coords_times_list = []
     i = 0
     while len(line) > 0:
         i = i + 1
         #print i
         try:
             tweet = jsonpickle.decode(line)
         except ValueError, e:
             print repr(e)
             line = self.inputfile.readline()
             continue
         if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
             print "unimplemented data item"
         else:
             #print tweet["text"]
             text = tweet["text"]
             tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
             tokens = tokenizer.tokenize(text)
             if tweet.has_key("coordinates"):
                 coord = tweet["coordinates"]
                 if coord == None:
                     print "coordinates null"
                 elif coord.has_key("type") and coord["type"] == "Point":
                     coords_times_list.append([coord["coordinates"], tweet_w])
                 else:
                     print "not a point"
         line = self.inputfile.readline()
Author: cgl, Project: sna-tools, Lines: 30, Source: stats.py


Example 10: indexDir

def indexDir(dirname):
  basename = os.path.basename(dirname.rstrip("/"))
  indexFile = open('./indexes/%s_index' % basename, 'w');

  idMap = {}
  indexDict = {}
  docId = 0
  for (root, dirnames, filenames) in os.walk(dirname):
    for filename in filenames:
      if (re.search("\.sw[op]$", filename) == None):
        with open(os.path.join(root, filename), 'r') as fh:
          idMap[docId] = filename

          tokens = tokenize(fh)
          for (pos, token) in tokens:
            token = stem(alias(token))
            try:
              positionMap = indexDict[token]
              try:
                positionMap[docId].append(pos)
              except KeyError:
                positionMap[docId] = [pos]
            except KeyError:
              indexDict[token] = {docId: [pos]}

          docId += 1

  fullIndex = {"id_map" : idMap, "index" : indexDict}
  indexFile.write(json.dumps(fullIndex))
  indexFile.close()
Author: fiono, Project: spotless_mind, Lines: 30, Source: indexer.py


Example 11: __init__

	def __init__(self, source):
		self.numTemps = 0
		self.macros = []
		self.mlMacros = []
		for mem in dir(self):
			mem = getattr(self, mem)
			if isinstance(mem, type) and issubclass(mem, Macro):
				if issubclass(mem, MLMacro):
					self.mlMacros.append(mem(self))
				else:
					self.macros.append(mem(self))
		self.macros.sort()
		self.mlMacros.sort()
		
		tokens = tokenizer.tokenize(source)
		pprint.pprint(tokens)
		
		code = self.compile(tokens)
		pprint.pprint(code)
		
		code = Module(
				None,
				Stmt(code)
			)
		
		set_filename('<macropy>', code)
		self.compiled = ModuleCodeGenerator(code).getCode()
Author: GrimDerp, Project: metapy, Lines: 27, Source: mcompiler.py


Example 12: test_lt_gt

 def test_lt_gt(self):
     test_str = "(and (< 3 5) true)"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Author: fridgei, Project: CS480, Lines: 7, Source: unit_tests.py


Example 13: test_math1

 def test_math1(self):
     test_str = "(+ (- 234.3 1.1) 23)"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Author: fridgei, Project: CS480, Lines: 7, Source: unit_tests.py


Example 14: getScore

def getScore(newtitle):
    query = tokenizer.tokenize(newtitle)
    res = idx.queryVector(query, 1)
    #print("{0} results.".format(len(res)))
    
    # Take average of (upvotes-downvotes) weighted by similarity score ^ 2
    # but only for posts with simscore > max(simscore)/2
    totalweight = 0.0
    totalscore = 0.0
    for n in res:
        simscore = n[1]
        simscore = simscore ** 0.5
        #if simscore < 0.75:
        #    continue

        post = postdata.posts[n[0]]
        #score = post["day"][1] - post["day"][2] # ups - downs^2
        #score = post["day"][1]
        #score = post["day"][1] + post["num_comments"]
        score = post["day"][1] - post["day"][2] + post["num_comments"]*2.5

        totalscore += float(score) * simscore
        totalweight += simscore

        #return float(score) # test

    if totalweight == 0:
        return 0.0 # couldn't make a score for this

    finalscore = (totalscore / totalweight)

    return finalscore
Author: keitase, Project: EECS498-term-project, Lines: 32, Source: query.py


Example 15: test_math2

 def test_math2(self):
     test_str = "(* 34 (- 23 45))"
     actual_tokens = tokenize(test_str)
     consumed, remaining = S(actual_tokens)
     code = generate_code(consumed)
     print test_str
     print code
Author: fridgei, Project: CS480, Lines: 7, Source: unit_tests.py


Example 16: test_while

 def test_while(self):
     test_str = "(while (< 3 i) (assign i (+ i 1)))"
     expected_tokens = [Token("(", "L_PAREN"),
                        Token("while", "E_WHILE"),
                        Token("(", "L_PAREN"),
                        Token("<", "O_LT"),
                        Token("3", "V_INT"),
                        Token("i", "V_STRING"),
                        Token(")", "R_PAREN"),
                        Token("(", "L_PAREN"),
                        Token("assign", "E_ASSIGN"),
                        Token("i", "V_STRING"),
                        Token("(", "L_PAREN"),
                        Token("+", "O_ADD"),
                        Token("i", "V_STRING"),
                        Token("1", "V_INT"),
                        Token(")", "R_PAREN"),
                        Token(")", "R_PAREN"),
                        Token(")", "R_PAREN")]
     actual_tokens = tokenize(test_str)
     for actual, expected in izip(actual_tokens, expected_tokens):
         self.assertEqual(actual, expected)
     consumed, remaining = S(actual_tokens)
     self.assertEqual(remaining, [])
     if not remaining:
         print "accepted"
         print consumed
Author: fridgei, Project: CS480, Lines: 27, Source: tests.py


Example 17: guess

    def guess(self, text):
        doc_counts = {}
        doc_inverse_counts = {}
        tokens = tokenize(text)
        scores = {}
        for label in self.labels:
            doc_counts[label] = self.doc_count(label)
            doc_inverse_counts[label] = self.doc_inverse_count(label)
            total = self.total_doc_count()
        for label in self.labels:
            logSum = 0.0
            for word in tokens:
                stem_total_count = self.stem_total_count(word)
                if stem_total_count == 0.0:
                    continue
                else:
                    word_prob = self.stem_label_count(label, word) / doc_counts[label]
                    word_inverse_prob = self.stem_inverse_label_count(label, word) / doc_inverse_counts[label]
                    wordicity = word_prob / (word_prob + word_inverse_prob)

                    wordicity = (( 1.0 * 0.5) + (stem_total_count * wordicity) ) / (1.0 + stem_total_count )
                    if wordicity == 0.0:
                        wordicity = 0.01
                    elif wordicity == 1:
                        wordicity = 0.99
                try:
                    logSum += math.log(1.0 - wordicity) - math.log(wordicity)
                except ValueError:
                    print "ValueError"
            try:
                scores[label] = 1.0 / (1.0 + math.exp(logSum))
            except OverflowError:
                print "OverflowError"
        return scores
Author: Radahika, Project: Persimmon, Lines: 34, Source: train.py


Example 18: test

    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        count = 0
        sender_bl = load_pickle('sender_bl.pickle')
        # scan email and define if msg is SPAM or HAM
        # first check if sender occurs in sender Blacklist
        # then count spamicity of the word using the Bayes approach
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            spamicity_list = []
            count += 1
            tokens = tokenize(body)
            # compute spamicity for each word and create list of the values
            for el in tokens:
                word_spamicity = [el, bs.word_spamicity(el)]
                spamicity_list.append(word_spamicity)
            # prepare list for Bayes
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # Consider only 15 'words'
            if prediction > 0.9 or sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
Author: unacau, Project: bayesian-spam-filtering, Lines: 29, Source: filter.py


Example 19: compute_ave_words_in_sentence

	def compute_ave_words_in_sentence(self):
		sentences = tokenizer.split_sentence(self.text)
		average = 0
		for sentence in sentences:
			average += len(tokenizer.tokenize(sentence))
		self.ave_words_in_sentence = 1.0 * average / len(sentences)
		return self.ave_words_in_sentence
Author: haknsahn, Project: authorRecognizer, Lines: 7, Source: document.py


Example 20: _parse

 def _parse(self, path, content, addWords):
    words = tokenizer.tokenize(path, content)
    wordList = []
    currNode = ParseNode(path, 0, None)
    currLine = [0, currNode]
    nodeId = 1
    for token, start, type in words:
       if type == tokenizer.NOTHING:
          if addWords:
             self.words.add(token)
          wordList.append((token, start, currLine))
       elif type == tokenizer.NEWLINE:
          wordList.append(('\\n', start, currLine))
          prevLine = currLine
          currLine = [currLine[0]+1, currNode]
       elif type == tokenizer.DEDENT:
          wordList.append(('\\d', start, currLine))
          currNode = currNode.parent
          currLine[1] = currNode
       elif type == tokenizer.INDENT:
          wordList.append(('\\i', start, currLine))
          currNode = ParseNode(path, nodeId, currNode)
          nodeId += 1
          prevLine[1] = currNode
          currLine[1] = currNode
    if len(wordList) == 0:
       wordList.append(('\\n', 0, currLine))
    return wordList
Author: pokey, Project: smartAutocomplete, Lines: 28, Source: Classifier.py



Note: The tokenizer.tokenize examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not reproduce without permission.

