Python tokenizer.Tokenizer Class Code Examples


This article collects typical usage examples of the Python tokenizer.Tokenizer class. If you are wondering what the Tokenizer class is for, how it is used, or what real-world code that uses it looks like, the curated examples below should help.



The sections below present 20 code examples of the Tokenizer class, sorted by popularity by default. Each example is quoted from an open-source project and followed by its attribution (developer, project, line count, source file).
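Most of the examples share the same basic pattern: construct a Tokenizer, then call its tokenize method on a string. The minimal sketch below illustrates that pattern only; it assumes a project-local tokenizer module whose Tokenizer constructor takes no required arguments and whose tokenize(text) returns a list of tokens (as in Examples 6 and 16), so adapt it to whichever constructor signature your project actually uses.

from tokenizer import Tokenizer   # project-local module; the API varies across the projects quoted below

tk = Tokenizer()                  # some projects pass arguments here (a stopword file, preserve_case, ...)
tokens = tk.tokenize("A short example sentence to split into tokens.")
print(tokens)                     # typically a list of token strings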

Example 1: main

def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # for each word in the vocabulary (in this case all words found in all reviews):
    # business id, review id, and position of each term occurrence
    # instead of using the review id, uses the line on which the review occurs as a unique identifier
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
Developer: jschear, Project: cs1951a-final, Lines: 33, Source: create_index.py
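For orientation, each row of the TSV that Example 1 writes pairs a token with its posting list: one (business_id, review_num, position) tuple per occurrence, each serialized into its own tab-separated cell after the header row. A hypothetical fragment of the output (all values below are illustrative only) might look like:

token	business_id	review_id	position	...
burger	('biz_001', 0, 17)	('biz_002', 42, 3)
coffee	('biz_001', 7, 0)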


Example 2: ext_json

def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"#request.form['status']
        filters = ""#request.form['exculdeurls']
        #rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            #r = requests.get(rdf)
            gg = Graph()
            #g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            #print rdf_content.readline()
            gg.parse(rdf_content,  format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"  
                uri_lookup = str(uri)+"\"" 
                lnum = ext.get_lines(rdf_content, uri_lookup)          
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())              
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Developer: FengPu, Project: x2r-me, Lines: 34, Source: x2r-me.py


Example 3: main

def main(args):
    try:
        (opts, args) = getopt(args, "o:TPX")
    except GetoptError:
        usage()
    if len(args) != 1:
        usage()

    from tokenizer import Tokenizer
    from parser import Parser
    from error import JtError
    import context
    from os.path import abspath

    filename = abspath(args[0])
    stdin = file(filename, "r")
    target = "P"
    stdout = sys.stdout
    for (ok, ov) in opts:
        if ok in ("-T", "-P", "-X"):
            target = ok[1]
        elif ok == "-o":
            stdout = file(ov, "w")
    contents = stdin.read()
    tokenizer = Tokenizer()
    tokenizer.build()
    tokenizer.input(contents)
    parser = Parser(tokenizer)
    result_tree = None
    try:
        result_tree = parser.parse()
    except JtError, error:
        failure(error)
Developer: jwilk, Project: jtc, Lines: 33, Source: cli.py


Example 4: execute

 def execute(self):
     if len(self.proj_paths) > 0:
         logging.info('Starting tokenizer. Producibles (logs, output, etc) can be found under the name '+self.target_folders)
         tokenizer = Tokenizer(self.proj_paths, self.DB_user, self.DB_pass, self.DB_name, logging, self.logs_folder, self.output_folder, self.N_PROCESSES, self.BATCH_SIZE, self.PROJECTS_CONFIGURATION)
         tokenizer.execute()
     else:
         logging.warning('The list of new projects is empty (or these are already on the DB).')
Developer: Mondego, Project: SourcererCC, Lines: 7, Source: tokenizerController.py


Example 5: _tokenize_tweet

 def _tokenize_tweet(self, tweet):
     """
     Input: tweet (String)
     Output: List of tokens
     """
     tok = Tokenizer(preserve_case=False)
     return tok.tokenize(tweet)
Developer: Chouffe, Project: senti-tweet, Lines: 7, Source: tools.py


Example 6: tokenize

    def tokenize(self, **kwargs):
        """
        Returns the tokenized string using a parser.
        """

        string_tokenizer = Tokenizer()

        return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Developer: DarkmatterVale, Project: regex4dummies, Lines: 8, Source: toolkit.py


Example 7: interpret_line

    def interpret_line(self, line):
        tokenizer = Tokenizer()
        tokenizer.parse(line)

        first_token = tokenizer.getNextToken()
        if first_token.type == Token.NUMBER:
            self.lines[int(first_token.value)] = tokenizer.prog[tokenizer.pos:]
            self.sort_lines()
        else:
            self.run_line(line)
Developer: tonyedgecombe, Project: pytinybasic, Lines: 10, Source: interpreter.py


Example 8: main

def main():
    tok = Tokenizer()
    mapping = MapFactory()
    uris = ["http://abc.ee.ntu/alf_123", "http://sc.e.ncli.ABCdefGU"]
    for uri in uris:
        term = tok.tokenized_url(uri)
        ent = MapEntry(uri, term, "", "", "")
        mapping.add(ent)
    jsonized_result = json.dumps(mapping.get())   
    print jsonized_result   
Developer: FengPu, Project: x2r-me, Lines: 10, Source: x2r-me.py


Example 9: testExecutionTreeWithItemAssignment

    def testExecutionTreeWithItemAssignment(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("A[B]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
        )

        # a little bit more complex
        tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
            exec_tree,
        )
Developer: pombredanne, Project: java-balivernes, Lines: 34, Source: expr_compiler.py


Example 10: testEvaluateFactors

    def testEvaluateFactors(self):

        c = ExpressionCompiler()

        tokenizer = Tokenizer()
        tokenizer.tokenize("7*7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(49.0, result)

        tokenizer.tokenize("7*7/7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(7.0, result)
Developer: pombredanne, Project: java-balivernes, Lines: 26, Source: expr_compiler.py


Example 11: main

def main():
	# first read in the inverted index file
	parser = argparse.ArgumentParser()
	parser.add_argument('-index', required=True, help='Path to inverted index file')
	parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
	opts = parser.parse_args()

	# Pre-processing
	f_index = open(opts.index,'r')
	print "loading index file..."
	wordsmap = {}
	# count = 0
	# for line in f_index:
	# 	count += 1
	# 	j_obj = json.load(line)
	# 	for k, v in j_obj.items():
	# 		wordsmap[k] = v
	# 	j_obj = None
	# 	if count % 100 == 0:
	# 		print count
	wordsmap = json.load(f_index)
	print "done"
	f_index.close()
	b_map = {}
	print "loading business file..."
	f_b = open(opts.business, 'r')
	line_num = 0
	for line in f_b:
		b_json = json.loads(line)
		b_map[str(line_num)]={"business_id":b_json['business_id'],"review_count":int(b_json['review_count']), "stars":float(b_json['stars'])}
		line_num += 1
	print "done"


	tokenizer = Tokenizer()
	# TODO: need to check error input  
	# Bug: c-d exit situation
	
	for line in sys.stdin:
		result = []
		line = line.strip('\n')
		if len(line)==0:
			continue
		elif line[0]=='"':
			line = line.strip('"')
			words = tokenizer.process_review(line)
			result = phrase_query(words, wordsmap)
		elif len(line.split())==1:
			words = tokenizer.process_review(line)
			result = one_word_query(words[0], wordsmap)
		else:
			words = tokenizer.process_review(line)
			result = free_text_query(words, wordsmap)
		rank_res = rank(words,result,b_map,wordsmap)
		print rank_res
Developer: cc26, Project: data-science-yelp, Lines: 55, Source: query_index.py


Example 12: __init__

 def __init__( self, string_to_tokenize = '', prefix_chars = '-=<>!+*&|/%^', suffix_chars = '=<>&|' ):
     Tokenizer.__init__( self, string_to_tokenize )
     self.prefix     =   prefix_chars
     self.suffix     =   suffix_chars
 ### Setup JavaScriptTokenizer-specific regexen
     self.PREFIX             =   re.compile( "[%s]" % self.prefix )
     self.SUFFIX             =   re.compile( "[%s]" % self.suffix )
     self.BEGIN_IDENTIFIER   =   self.CHARACTER
     self.MULTILINE_COMMENT  =   re.compile("[\*]")
     self.END_COMMENT        =   re.compile("[/]")
     self.ESCAPE             =   re.compile("[\\\\]")
Developer: mikewest, Project: topdown, Lines: 11, Source: javascripttokenizer.py


Example 13: correct_macro_syntax_test

def correct_macro_syntax_test():
    macro_string = """
!config {
output: pdf, html
table_of_contents: true
}"""
    tokenizer = Tokenizer(macro_string)
    for token in tokenizer:
        if token[0] == "!":
            open_brackets = tokenizer.next()
            if open_brackets != "{":
                raise DMLSyntaxError(open_brackets, "{")
Developer: Ed-von-Schleck, Project: dml, Lines: 12, Source: test-function-syntax.py


Example 14: test_ast_opts

    def test_ast_opts(self):
        a = AST()
        t = Tokenizer()
        opts = {}
        opts['get-me'] = 'I am superman'

        a.parse(t.parse('{{ opts.get("get-me") }}'))
        c = a.traverse(opts=opts)
        self.assertEqual(c.buffer, 'I am superman')

        a.parse(t.parse('{@ if opts.get("get-me"): @}I am superman{@ end @}'))
        c = a.traverse(opts=opts)
        self.assertEqual(c.buffer, 'I am superman')
Developer: narupo, Project: cap, Lines: 13, Source: tests.py


Example 15: __init__

	def __init__(self, _what, _who, _when, _where, _why, _how, _text):
		self.what = Tokenizer.removeNonAscii(_what).replace(".\"",". \"")
		self.who = Tokenizer.removeNonAscii(_who).replace(".\"",". \"")
		self.when = Tokenizer.removeNonAscii(_when).replace(".\"",". \"")
		self.where = Tokenizer.removeNonAscii(_where).replace(".\"",". \"")
		self.why = Tokenizer.removeNonAscii(_why).replace(".\"",". \"")
		self.how = Tokenizer.removeNonAscii(_how).replace(".\"",". \"")
		self.text = Tokenizer.removeNonAscii(_text).replace(".\"",". \"")
		self.sentences = Tokenizer.getSentences(self.text)
		self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]
Developer: anpandu, Project: 5w1h_extractor, Lines: 10, Source: Info5W1H.py


Example 16: analyze

def analyze(string):
    scanner = Tokenizer()
    list_of_tokens= scanner.tokenize(string)
    print "-------------"
    print "TOKEN LIST:"
    print list_of_tokens
    parser = QueryParser()
    print "----------------"
    print "PARSING RESULT"
    print "----------------"
    print parser.parse(list_of_tokens)

    semparser = QuerySemanticParser(parser.parse(list_of_tokens))
    semparser.parse()
Developer: dav009, Project: truthgraph, Lines: 14, Source: main.py


Example 17: __init__

    def __init__( self, string_to_tokenize = '' ):
        Tokenizer.__init__( self, string_to_tokenize )

    ### Setup CSSTokenizer-specific regexen
### Throwing everything away after reading through the CSS spec.
### I ought be using the specified tokens, so I will.
# IDENT {ident}
# ATKEYWORD @{ident}
# STRING    {string}
# INVALID   {invalid}
# HASH  #{name}
# NUMBER    {num}
# PERCENTAGE    {num}%
# DIMENSION {num}{ident}
# URI   url\({w}{string}{w}\)
# |url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}\)
# UNICODE-RANGE U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?
# CDO   <!--
# CDC   -->
# ; ;
# { \{
# } \}
# ( \(
# ) \)
# [ \[
# ] \]
# S [ \t\r\n\f]+
# COMMENT   \/\*[^*]*\*+([^/*][^*]*\*+)*\/
# FUNCTION  {ident}\(
# INCLUDES  ~=
# DASHMATCH |=
# DELIM any other character not matched by the above rules, and neither a single nor a double quote
# 
# 
# ident [-]?{nmstart}{nmchar}*
# name  {nmchar}+
# nmstart   [_a-z]|{nonascii}|{escape}
# nonascii  [^\0-\177]
# unicode   \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
# escape    {unicode}|\\[^\n\r\f0-9a-f]
# nmchar    [_a-z0-9-]|{nonascii}|{escape}
# num   [0-9]+|[0-9]*\.[0-9]+
# string    {string1}|{string2}
# string1   \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
# string2   \'([^\n\r\f\\']|\\{nl}|{escape})*\'
# invalid   {invalid1}|{invalid2}
# invalid1  \"([^\n\r\f\\"]|\\{nl}|{escape})*
# invalid2  \'([^\n\r\f\\']|\\{nl}|{escape})*
# nl    \n|\r\n|\r|\f
# w [ \t\r\n\f]*
Developer: mikewest, Project: topdown, Lines: 50, Source: csstokenizer.py


Example 18: testEvaluateNegation

    def testEvaluateNegation(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("not 0")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(1, result)
Developer: pombredanne, Project: java-balivernes, Lines: 15, Source: expr_compiler.py


Example 19: interpretStatement

 def interpretStatement(self):
     tokens = Tokenizer(self.IR)
     instr = tokens.next().lower()
     stmt = ""
     while tokens.peek() is not None:
         stmt += tokens.next()
     if instr[0] == 's':
         self.interpretSet(stmt)
     elif instr[0] == 'j':
         if len(instr) == 5:
             self.interpretJumpt(stmt)
         elif len(instr) == 4:
             self.interpretJump(stmt)
     elif instr[0] == 'h':
         self.halt(tokens)
Developer: aaronlaikh, Project: Projects, Lines: 15, Source: INTERPRETER.py


Example 20: Parser

class Parser(object):
    def __init__(self, stmt):
        # We always wrap with ()'s
        self.tnz = Tokenizer('(' + stmt + ')')

    def pop(self):
        return self.tnz.pop()

    def peek(self):
        return self.tnz.peek()

    def top(self):
        return self.tnz.top()

    def parse(self, indent=0):
        indent = deepcopy(indent)
        indent += 1
        if istype(self.top(), 'Lparen'):
            self.pop()  # Open paren
            n = self.parse(indent)
            cp = self.pop()  # Close paren
            if istype(self.top(), 'Bop'):
                bopr = Node(self.pop(), indent)
                bopr.l_child = n
                bopr.r_child = self.parse(indent)
                return bopr
            else:
                return n
        if istype(self.top(), 'Term'):
            if istype(self.peek(), 'Bop'):
                t1 = Node(self.pop(), indent)
                bopr = Node(self.pop(), indent)
                bopr.l_child = t1
                if istype(self.top(), 'Term'):
                    bopr.r_child = self.parse(indent)
                elif istype(self.top(), 'Lparen'):
                    bopr.r_child = self.parse(indent)
                else:
                    raise SyntaxError("Expected Term or (")
                return bopr
            elif istype(self.peek(), 'Rparen'):
                t1 = Node(self.pop(), indent)
                return t1
            elif istype(self.peek(), 'Term'):
                t1 = Node(self.pop(), indent)
                return t1
            else:
                raise SyntaxError("Expecting term or (")
Developer: Ziaunys, Project: chili, Lines: 48, Source: parser.py



Note: The tokenizer.Tokenizer class examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors, and copyright in the code remains with those authors; consult each project's license before redistributing or reusing it. Please do not repost without permission.

