This article collects typical usage examples of the Python class tokenizer.Tokenizer. If you are unsure what the Tokenizer class does, how it is used, or what real usage looks like, the curated class examples below should help.
Twenty Tokenizer code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python code examples.
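One caveat before the examples: tokenizer.Tokenizer is not a single library class. Each example imports a Tokenizer defined in its own project, so constructor arguments and method names (tokenize, parse, tokenized_url, process_review, next/peek/pop, and various static helpers) differ from one example to the next. The snippet below is a minimal stand-in sketch, not any of the projects' real classes; it assumes only a lower-casing word tokenizer with an optional stopword file and is roughly compatible with the calls in Examples 1 and 5.

import re

class Tokenizer(object):
    """Minimal illustrative tokenizer (an assumption, not a real project class)."""

    def __init__(self, stopword_file=None, preserve_case=True):
        self.preserve_case = preserve_case
        self.stopwords = set()
        if stopword_file:  # optional stopword list, one word per line
            with open(stopword_file) as f:
                self.stopwords = set(w.strip() for w in f if w.strip())

    def tokenize(self, text):
        if not self.preserve_case:
            text = text.lower()
        words = re.findall(r"[A-Za-z0-9']+", text)  # crude word segmentation
        return [w for w in words if w.lower() not in self.stopwords]

# Tokenizer(preserve_case=False).tokenize("Great pizza, great crust!")
# -> ['great', 'pizza', 'great', 'crust']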
Example 1: main
def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # For each word in the vocabulary (here: every word found in all reviews), record the
    # business id, review id, and position of each term occurrence.
    # Instead of the review id, the line on which the review occurs is used as a unique identifier.
    # (A toy illustration of the resulting token_map follows this example.)
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
Author: jschear | Project: cs1951a-final | Lines: 33 | Source: create_index.py
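To make the structure described in the comments above concrete, here is a toy illustration with hypothetical data: a single review on line 0 of the input file and an assumed lower-casing, stopword-free tokenization.

from collections import defaultdict

token_map = defaultdict(list)
business_id, review_num = 'b123', 0               # hypothetical values
tokens = ['great', 'pizza', 'great', 'crust']     # assumed tokenizer output
for position, word in enumerate(tokens):
    token_map[word].append((business_id, review_num, position))

print(sorted(token_map.items()))
# [('crust', [('b123', 0, 3)]),
#  ('great', [('b123', 0, 0), ('b123', 0, 2)]),
#  ('pizza', [('b123', 0, 1)])]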
Example 2: ext_json
def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"  # request.form['status']
        filters = ""  # request.form['exculdeurls']
        # rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            # r = requests.get(rdf)
            gg = Graph()
            # g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            # print rdf_content.readline()
            gg.parse(rdf_content, format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"
                uri_lookup = str(uri) + "\""
                lnum = ext.get_lines(rdf_content, uri_lookup)
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Author: FengPu | Project: x2r-me | Lines: 34 | Source: x2r-me.py
Example 3: main
def main(args):
    try:
        (opts, args) = getopt(args, "o:TPX")
    except GetoptError:
        usage()
    if len(args) != 1:
        usage()
    from tokenizer import Tokenizer
    from parser import Parser
    from error import JtError
    import context
    from os.path import abspath
    filename = abspath(args[0])
    stdin = file(filename, "r")
    target = "P"
    stdout = sys.stdout
    for (ok, ov) in opts:
        if ok in ("-T", "-P", "-X"):
            target = ok[1]
        elif ok == "-o":
            stdout = file(ov, "w")
    contents = stdin.read()
    tokenizer = Tokenizer()
    tokenizer.build()
    tokenizer.input(contents)
    parser = Parser(tokenizer)
    result_tree = None
    try:
        result_tree = parser.parse()
    except JtError, error:
        failure(error)
Author: jwilk | Project: jtc | Lines: 33 | Source: cli.py
Example 4: execute
def execute(self):
    if len(self.proj_paths) > 0:
        logging.info('Starting tokenizer. Producibles (logs, output, etc) can be found under the name ' + self.target_folders)
        tokenizer = Tokenizer(self.proj_paths, self.DB_user, self.DB_pass, self.DB_name, logging, self.logs_folder, self.output_folder, self.N_PROCESSES, self.BATCH_SIZE, self.PROJECTS_CONFIGURATION)
        tokenizer.execute()
    else:
        logging.warning('The list of new projects is empty (or these are already on the DB).')
Author: Mondego | Project: SourcererCC | Lines: 7 | Source: tokenizerController.py
Example 5: _tokenize_tweet
def _tokenize_tweet(self, tweet):
    """
    Input: tweet (String)
    Output: List of tokens
    """
    tok = Tokenizer(preserve_case=False)
    return tok.tokenize(tweet)
Author: Chouffe | Project: senti-tweet | Lines: 7 | Source: tools.py
Example 6: tokenize
def tokenize(self, **kwargs):
    """
    Returns the tokenized string using a parser.
    """
    string_tokenizer = Tokenizer()
    return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Author: DarkmatterVale | Project: regex4dummies | Lines: 8 | Source: toolkit.py
Example 7: interpret_line
def interpret_line(self, line):
    tokenizer = Tokenizer()
    tokenizer.parse(line)
    first_token = tokenizer.getNextToken()
    if first_token.type == Token.NUMBER:
        # A leading line number stores the rest of the statement in the program listing
        self.lines[int(first_token.value)] = tokenizer.prog[tokenizer.pos:]
        self.sort_lines()
    else:
        # No line number: execute the statement immediately
        self.run_line(line)
Author: tonyedgecombe | Project: pytinybasic | Lines: 10 | Source: interpreter.py
Example 8: main
def main():
    tok = Tokenizer()
    mapping = MapFactory()
    uris = ["http://abc.ee.ntu/alf_123", "http://sc.e.ncli.ABCdefGU"]
    for uri in uris:
        term = tok.tokenized_url(uri)
        ent = MapEntry(uri, term, "", "", "")
        mapping.add(ent)
    jsonized_result = json.dumps(mapping.get())
    print jsonized_result
Author: FengPu | Project: x2r-me | Lines: 10 | Source: x2r-me.py
Example 9: testExecutionTreeWithItemAssignment
def testExecutionTreeWithItemAssignment(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("A[B]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
    )
    # a little bit more complex
    tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
        exec_tree,
    )
Author: pombredanne | Project: java-balivernes | Lines: 34 | Source: expr_compiler.py
Example 10: testEvaluateFactors
def testEvaluateFactors(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("7*7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(49.0, result)

    tokenizer.tokenize("7*7/7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(7.0, result)
Author: pombredanne | Project: java-balivernes | Lines: 26 | Source: expr_compiler.py
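Examples 9 and 10 (and Example 18 below) repeat the same tokenize, next, compile, evaluate sequence. If the same ExpressionCompiler and Tokenizer classes are importable, that pattern could be wrapped in a small helper such as the sketch below; this is an editorial convenience drawn from the tests, not part of the project itself.

def evaluate_expression(text):
    compiler = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize(text)   # load the expression string
    tokenizer.next()           # advance to the first token
    expr = compiler.compile(tokenizer)
    return expr.evaluate()

# evaluate_expression("7*7/7") would then return 7.0, matching the assertion above.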
Example 11: main
def main():
    # First read in the inverted index file
    parser = argparse.ArgumentParser()
    parser.add_argument('-index', required=True, help='Path to inverted index file')
    parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
    opts = parser.parse_args()

    # Pre-processing
    f_index = open(opts.index, 'r')
    print "loading index file..."
    wordsmap = {}
    # count = 0
    # for line in f_index:
    #     count += 1
    #     j_obj = json.load(line)
    #     for k, v in j_obj.items():
    #         wordsmap[k] = v
    #     j_obj = None
    #     if count % 100 == 0:
    #         print count
    wordsmap = json.load(f_index)
    print "done"
    f_index.close()

    b_map = {}
    print "loading business file..."
    f_b = open(opts.business, 'r')
    line_num = 0
    for line in f_b:
        b_json = json.loads(line)
        b_map[str(line_num)] = {"business_id": b_json['business_id'], "review_count": int(b_json['review_count']), "stars": float(b_json['stars'])}
        line_num += 1
    print "done"

    tokenizer = Tokenizer()
    # TODO: need to check error input
    # Bug: Ctrl-D exit situation
    for line in sys.stdin:
        result = []
        line = line.strip('\n')
        if len(line) == 0:
            continue
        elif line[0] == '"':
            line = line.strip('"')
            words = tokenizer.process_review(line)
            result = phrase_query(words, wordsmap)
        elif len(line.split()) == 1:
            words = tokenizer.process_review(line)
            result = one_word_query(words[0], wordsmap)
        else:
            words = tokenizer.process_review(line)
            result = free_text_query(words, wordsmap)
        rank_res = rank(words, result, b_map, wordsmap)
        print rank_res
Author: cc26 | Project: data-science-yelp | Lines: 55 | Source: query_index.py
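The dispatch above distinguishes quoted phrase queries, single-word queries, and free-text queries. The phrase_query, one_word_query, free_text_query and rank functions live elsewhere in query_index.py and are not shown here; as a purely illustrative sketch, and only under the assumption that wordsmap maps each token to its postings list, the single-word case could be as simple as:

def one_word_query(word, wordsmap):
    # Hypothetical sketch: return the postings for one token, or nothing if absent.
    return wordsmap.get(word, [])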
Example 12: __init__
def __init__( self, string_to_tokenize = '', prefix_chars = '-=<>!+*&|/%^', suffix_chars = '=<>&|' ):
    Tokenizer.__init__( self, string_to_tokenize )
    self.prefix = prefix_chars
    self.suffix = suffix_chars
    ### Setup JavaScriptTokenizer-specific regexen
    self.PREFIX = re.compile( "[%s]" % self.prefix )
    self.SUFFIX = re.compile( "[%s]" % self.suffix )
    self.BEGIN_IDENTIFIER = self.CHARACTER
    self.MULTILINE_COMMENT = re.compile("[\*]")
    self.END_COMMENT = re.compile("[/]")
    self.ESCAPE = re.compile("[\\\\]")
Author: mikewest | Project: topdown | Lines: 11 | Source: javascripttokenizer.py
Example 13: correct_macro_syntax_test
def correct_macro_syntax_test():
    macro_string = """
    !config {
        output: pdf, html
        table_of_contents: true
    }"""
    tokenizer = Tokenizer(macro_string)
    for token in tokenizer:
        if token[0] == "!":
            open_brackets = tokenizer.next()
            if open_brackets != "{":
                raise DMLSyntaxError(open_brackets, "{")
Author: Ed-von-Schleck | Project: dml | Lines: 12 | Source: test-function-syntax.py
Example 14: test_ast_opts
def test_ast_opts(self):
    a = AST()
    t = Tokenizer()
    opts = {}
    opts['get-me'] = 'I am superman'

    a.parse(t.parse('{{ opts.get("get-me") }}'))
    c = a.traverse(opts=opts)
    self.assertEqual(c.buffer, 'I am superman')

    a.parse(t.parse('{@ if opts.get("get-me"): @}I am superman{@ end @}'))
    c = a.traverse(opts=opts)
    self.assertEqual(c.buffer, 'I am superman')
Author: narupo | Project: cap | Lines: 13 | Source: tests.py
Example 15: __init__
def __init__(self, _what, _who, _when, _where, _why, _how, _text):
    self.what = Tokenizer.removeNonAscii(_what).replace(".\"", ". \"")
    self.who = Tokenizer.removeNonAscii(_who).replace(".\"", ". \"")
    self.when = Tokenizer.removeNonAscii(_when).replace(".\"", ". \"")
    self.where = Tokenizer.removeNonAscii(_where).replace(".\"", ". \"")
    self.why = Tokenizer.removeNonAscii(_why).replace(".\"", ". \"")
    self.how = Tokenizer.removeNonAscii(_how).replace(".\"", ". \"")
    self.text = Tokenizer.removeNonAscii(_text).replace(".\"", ". \"")
    self.sentences = Tokenizer.getSentences(self.text)
    self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]
Author: anpandu | Project: 5w1h_extractor | Lines: 10 | Source: Info5W1H.py
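This example calls Tokenizer.removeNonAscii, Tokenizer.getSentences and Tokenizer.getTokens as class-level helpers. The real implementations belong to the 5w1h_extractor project; the sketch below only shows hypothetical, naive versions of those helpers so the calls above are easier to read.

import re

class Tokenizer(object):
    @staticmethod
    def removeNonAscii(text):
        # drop any character outside the 7-bit ASCII range
        return ''.join(ch for ch in text if ord(ch) < 128)

    @staticmethod
    def getSentences(text):
        # naive split on sentence-ending punctuation followed by whitespace
        return [s for s in re.split(r'(?<=[.!?])\s+', text) if s]

    @staticmethod
    def getTokens(sentence):
        return re.findall(r"\w+", sentence)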
Example 16: analyze
def analyze(string):
    scanner = Tokenizer()
    list_of_tokens = scanner.tokenize(string)
    print "-------------"
    print "TOKEN LIST:"
    print list_of_tokens
    parser = QueryParser()
    print "----------------"
    print "PARSING RESULT"
    print "----------------"
    print parser.parse(list_of_tokens)
    semparser = QuerySemanticParser(parser.parse(list_of_tokens))
    semparser.parse()
Author: dav009 | Project: truthgraph | Lines: 14 | Source: main.py
Example 17: __init__
def __init__( self, string_to_tokenize = '' ):
    Tokenizer.__init__( self, string_to_tokenize )
    ### Setup CSSTokenizer-specific regexen
    ### Throwing everything away after reading through the CSS spec.
    ### I ought be using the specified tokens, so I will.
    #
    #   IDENT          {ident}
    #   ATKEYWORD      @{ident}
    #   STRING         {string}
    #   INVALID        {invalid}
    #   HASH           #{name}
    #   NUMBER         {num}
    #   PERCENTAGE     {num}%
    #   DIMENSION      {num}{ident}
    #   URI            url\({w}{string}{w}\)
    #                  |url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}\)
    #   UNICODE-RANGE  U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?
    #   CDO            <!--
    #   CDC            -->
    #   ;              ;
    #   {              \{
    #   }              \}
    #   (              \(
    #   )              \)
    #   [              \[
    #   ]              \]
    #   S              [ \t\r\n\f]+
    #   COMMENT        \/\*[^*]*\*+([^/*][^*]*\*+)*\/
    #   FUNCTION       {ident}\(
    #   INCLUDES       ~=
    #   DASHMATCH      |=
    #   DELIM          any other character not matched by the above rules, and neither a single nor a double quote
    #
    #   ident          [-]?{nmstart}{nmchar}*
    #   name           {nmchar}+
    #   nmstart        [_a-z]|{nonascii}|{escape}
    #   nonascii       [^\0-\177]
    #   unicode        \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
    #   escape         {unicode}|\\[^\n\r\f0-9a-f]
    #   nmchar         [_a-z0-9-]|{nonascii}|{escape}
    #   num            [0-9]+|[0-9]*\.[0-9]+
    #   string         {string1}|{string2}
    #   string1        \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
    #   string2        \'([^\n\r\f\\']|\\{nl}|{escape})*\'
    #   invalid        {invalid1}|{invalid2}
    #   invalid1       \"([^\n\r\f\\"]|\\{nl}|{escape})*
    #   invalid2       \'([^\n\r\f\\']|\\{nl}|{escape})*
    #   nl             \n|\r\n|\r|\f
    #   w              [ \t\r\n\f]*
    # (A short demonstration of the num and a simplified ident pattern follows this example.)
Author: mikewest | Project: topdown | Lines: 50 | Source: csstokenizer.py
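As a quick, self-contained check of two of the token patterns quoted in the comment block above (the ident regex below is simplified: it drops the {nonascii} and {escape} alternatives, which is an assumption of this sketch rather than the project's actual compiled pattern):

import re

NUM = re.compile(r'[0-9]+|[0-9]*\.[0-9]+')
IDENT = re.compile(r'-?[_a-z][_a-z0-9-]*', re.IGNORECASE)  # simplified ident

print(NUM.match('42').group())                     # '42'
print(NUM.match('.5').group())                     # '.5'
print(IDENT.match('-moz-border-radius').group())   # '-moz-border-radius'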
Example 18: testEvaluateNegation
def testEvaluateNegation(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("not 0")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(1, result)
Author: pombredanne | Project: java-balivernes | Lines: 15 | Source: expr_compiler.py
Example 19: interpretStatement
def interpretStatement(self):
    tokens = Tokenizer(self.IR)
    instr = tokens.next().lower()
    stmt = ""
    while tokens.peek() is not None:
        stmt += tokens.next()
    if instr[0] == 's':
        self.interpretSet(stmt)
    elif instr[0] == 'j':
        if len(instr) == 5:
            self.interpretJumpt(stmt)
        elif len(instr) == 4:
            self.interpretJump(stmt)
    elif instr[0] == 'h':
        self.halt(tokens)
Author: aaronlaikh | Project: Projects | Lines: 15 | Source: INTERPRETER.py
Example 20: Parser
class Parser(object):
    def __init__(self, stmt):
        # We always wrap with ()'s
        self.tnz = Tokenizer('(' + stmt + ')')

    def pop(self):
        return self.tnz.pop()

    def peek(self):
        return self.tnz.peek()

    def top(self):
        return self.tnz.top()

    def parse(self, indent=0):
        indent = deepcopy(indent)
        indent += 1
        if istype(self.top(), 'Lparen'):
            self.pop()  # Open paren
            n = self.parse(indent)
            cp = self.pop()  # Close paren
            if istype(self.top(), 'Bop'):
                bopr = Node(self.pop(), indent)
                bopr.l_child = n
                bopr.r_child = self.parse(indent)
                return bopr
            else:
                return n
        if istype(self.top(), 'Term'):
            if istype(self.peek(), 'Bop'):
                t1 = Node(self.pop(), indent)
                bopr = Node(self.pop(), indent)
                bopr.l_child = t1
                if istype(self.top(), 'Term'):
                    bopr.r_child = self.parse(indent)
                elif istype(self.top(), 'Lparen'):
                    bopr.r_child = self.parse(indent)
                else:
                    raise SyntaxError("Expected Term or (")
                return bopr
            elif istype(self.peek(), 'Rparen'):
                t1 = Node(self.pop(), indent)
                return t1
            elif istype(self.peek(), 'Term'):
                t1 = Node(self.pop(), indent)
                return t1
            else:
                raise SyntaxError("Expecting term or (")
Author: Ziaunys | Project: chili | Lines: 48 | Source: parser.py
Note: The tokenizer.Tokenizer class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects; copyright remains with the original authors, and redistribution or reuse should follow each project's license. Do not reproduce without permission.