This article collects typical usage examples of the twokenize.tokenize function in Python. If you are wondering how to use tokenize, what it is good for, or what calling it looks like in practice, the curated examples below may help.
Twenty code examples of the tokenize function are shown below, sorted by popularity by default.
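Before diving into the examples, here is a minimal orientation sketch (not taken from any of the projects below), assuming the common ark-twitter-nlp-style twokenize module where tokenize(text) takes a string and returns a list of token strings; the sample tweet is invented:

import twokenize

text = "OMG can't wait for tonight :) http://example.com @friend #excited"
tokens = twokenize.tokenize(text)
# twokenize is Twitter-aware: emoticons, URLs, @-mentions and #hashtags
# come back as single tokens rather than being split on punctuation
print(tokens)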
Example 1: Extract
def Extract(self, text):
    features = []
    words = twokenize.tokenize(text)
    # hand-crafted features
    iCapitalized = True
    nCapitalized = 0.1
    nAllCaps = 0.1
    nCapLowerViolated = 0.1
    nCapUpperViolated = 0.1
    nWords = 0.1
    for i in range(len(words)):
        capitalized = re.search(r'^([A-Z]|[a-z][A-Z])', words[i])
        if capitalized and not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
            nCapitalized += 1.0
        if not (i == 0 or re.match(r"\.|\?|!|@.+|http:.+|:|\"", words[i-1])):
            if capitalized and self.capDict.get(words[i].lower(), '1') != '1':
                nCapUpperViolated += 1.0
                features.append(self.fVocab.GetID('upperViolated=%s' % words[i].lower()))
            elif not capitalized and re.match(r'[a-z]+', words[i]) and self.capDict.get(words[i].lower(), '1') != '0':
                nCapLowerViolated += 1.0
                #features.append(self.fVocab.GetID('lowerViolated=%s' % words[i].lower()))
        if re.match(r'\w+', words[i][0:1]):
            nWords += 1
        if re.match(r"i|i'm|im|u", words[i]):
            iCapitalized = False
        if re.match(r"[A-Z]{2,}", words[i]):
            nAllCaps += 1
    features.append(self.fVocab.GetID('iCapitalized=%s' % iCapitalized))
    return (' '.join(["%s:1" % x for x in features])
            + " %s:%s" % (self.fVocab.GetID('nAllCaps'), nAllCaps / nWords)
            + " %s:%s" % (self.fVocab.GetID('nCapitalized'), nCapitalized / nWords)
            + " %s:%s" % (self.fVocab.GetID('nCapLowerViolated'), nCapLowerViolated / nWords)
            + " %s:%s" % (self.fVocab.GetID('nCapUpperViolated'), nCapUpperViolated / nWords))
Developer: 52nlp, Project: twitter_nlp, Lines: 34, Source: cap_classifier.py
Example 2: kshinglize
def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """Tokenizes string s, removes stopwords, and returns a set of k-shingles."""
    s = s.strip().lower()
    tokens_raw = twokenize.tokenize(s)
    tokens = filterstopwords(tokens_raw, stopwords)
    return tokens_to_kshingles(tokens, k)
Developer: driscoll, Project: cluster, Lines: 7, Source: cluster.py
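In Example 2, filterstopwords and tokens_to_kshingles are helpers defined elsewhere in the cluster project. As a rough, self-contained illustration of the k-shingling step only (hypothetical helper, not the project's actual implementation):

def to_kshingles(tokens, k=3):
    # a k-shingle is k consecutive tokens joined by a space;
    # returns an empty set when there are fewer than k tokens
    return set(' '.join(tokens[i:i + k]) for i in range(len(tokens) - k + 1))

# to_kshingles(['the', 'cat', 'sat', 'on', 'the', 'mat'], k=3)
# -> {'the cat sat', 'cat sat on', 'sat on the', 'on the mat'}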
Example 3: main
def main(argv):
    tagger = PerceptronTagger()
    tagset = None
    # 'line' comes from the surrounding script (not shown in this excerpt)
    tokens = tokenize(line)
    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    format_tagged(tags)
Developer: h4x0rsz, Project: senior-design, Lines: 7, Source: tagAndLabel.py
Example 4: learn_terms
def learn_terms(self, tweets_file_object, learn_lemmas=True, cache_size=1000000):
    reader = csv.reader(tweets_file_object, delimiter=",", quotechar="\"")
    term_freq = Counter()
    term_id_map = dict()
    tweet_vectors = []
    for row in reader:
        tweet_id = int(row[0])
        tweet_text = row[-1]
        terms = [t.lower().encode("utf-8") for t in twokenize.tokenize(tweet_text)]
        if learn_lemmas:
            terms = [self.lmtz.lemmatize(term) for term in terms]
        tweet_sp_vector = []
        counted_ids = []
        for term in terms:
            if term not in term_id_map:
                term_id = len(term_id_map)
                term_id_map[term] = term_id
            else:
                term_id = term_id_map[term]
            if term_id not in counted_ids:
                term_freq[term_id] += 1
                counted_ids.append(term_id)
            tweet_sp_vector.append(term_id)
        tweet_vectors.append((tweet_id, tweet_sp_vector))
        if len(tweet_vectors) >= cache_size:
            self.write_tweet_vectors(tweet_vectors)
            tweet_vectors = []
    self.write_tweet_vectors(tweet_vectors)
    self.write_terms(term_id_map, term_freq)
Developer: zaycev, Project: n7, Lines: 29, Source: search.py
Example 5: preprocess
def preprocess(m, sep_emoji=False):
    m = m.lower()
    m = max_reps(m)
    # replace user mentions with the token '@user'
    # (the original pattern was mangled by the source page's e-mail obfuscation;
    #  a generic mention-matching pattern is assumed here)
    user_regex = r"@\S+?( |$)|<@mention>"
    m = re.sub(user_regex, " @user ", m, flags=re.I)
    # replace urls with the token 'url'
    m = re.sub(twokenize.url, " url ", m, flags=re.I)
    tokenized_msg = ' '.join(twokenize.tokenize(m)).strip()
    if sep_emoji:
        # tokenize emoji; this tokenizer, however, splits repeated punctuation,
        # e.g. "blah blah!!!" -> ['blah', 'blah', '!', '!', '!'] instead of
        # ['blah', 'blah', '!!!'], so identical punctuation runs are re-joined below
        m_toks = tokenized_msg.split()
        n_toks = twk.tokenize(tokenized_msg)
        if len(n_toks) != len(m_toks):
            # check if there is any punctuation in this string
            has_punct = map(lambda x: x in twk.punctuation, n_toks)
            if any(has_punct):
                new_m = n_toks[0]
                for i in xrange(1, len(n_toks)):
                    # while the same punctuation token shows up, concatenate
                    if has_punct[i] and has_punct[i-1] and (n_toks[i] == n_toks[i-1]):
                        new_m += n_toks[i]
                    else:
                        # otherwise add a space
                        new_m += " " + n_toks[i]
                tokenized_msg = new_m
    return tokenized_msg.lstrip()
Developer: samiroid, Project: utils, Lines: 27, Source: __init__.py
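The sep_emoji branch in Example 5 re-joins runs of identical punctuation that the emoji tokenizer split apart. A small standalone sketch of that merge step (plain Python, with a made-up punctuation set standing in for twk.punctuation):

PUNCT = set("!?.,;:")

def merge_repeated_punct(toks):
    # concatenate runs of the identical punctuation token: '!', '!', '!' -> '!!!'
    if not toks:
        return []
    merged = [toks[0]]
    prev = toks[0]
    for tok in toks[1:]:
        if tok in PUNCT and prev in PUNCT and tok == prev:
            merged[-1] += tok
        else:
            merged.append(tok)
        prev = tok
    return merged

# merge_repeated_punct(['blah', 'blah', '!', '!', '!']) -> ['blah', 'blah', '!!!']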
Example 6: main
def main(argv):
    if len(sys.argv) != 3:
        print("Usage:> python getTaggedFile.py infile.txt outfile.txt")
        exit()
    infile_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])
    infile = open(infile_name, 'r')
    outfile = open(outfile_name, 'w')
    tagger = PerceptronTagger()
    print("Reading file...")
    line = infile.readline()
    while line != '':
        # Use Twokenizer as the Twitter-aware tokenizer
        tagset = None
        tokens = tokenize(line)
        tags = nltk.tag._pos_tag(tokens, tagset, tagger)
        outfile.write(format_tagged(tags))
        line = infile.readline()
    # close files
    infile.close()
    outfile.close()
    print("Finished tagging... Closing files.")
Developer: h4x0rsz, Project: senior-design, Lines: 29, Source: getTaggedFile.py
Example 7: __init__
def __init__(self, testData):
    self.labeledTweets = []
    for line in open(testData):
        line = line.rstrip('\n')
        fields = line.split('\t')
        fields[6] = ' '.join(twokenize.tokenize(fields[6]))
        self.labeledTweets.append(fields)
Developer: 52nlp, Project: twitter_nlp, Lines: 7, Source: cap_eval.py
Example 8: process
def process(self, text):
    tTweet = ""
    for word in text.split():
        if "#" in word:
            word = word.replace("#", " ")
        f = 0
        for tt in self.remove:
            if tt in word:
                f = 1
        if f == 1:
            continue
        tTweet = " ".join([tTweet, word])
    tTweet = tTweet.strip()
    tempTweet = ""
    for word in twokenize.tokenize(tTweet):
        if word != " " and word not in self.stop and not word.isdigit():
            word = word.strip().lower()
            if len(word) > 26:
                word = word[:27]
            # Normalize emoticons
            try:
                word = self.emoticons[word]
            except:
                # Normalize acronyms
                try:
                    try:
                        if self.wordDict[word] == 1:
                            word = word
                    except:
                        word = self.acronyms[word]
                except:
                    # Normalize contractions
                    try:
                        word = self.contractions[word]
                    except:
                        # Normalize words (spelling)
                        try:
                            if self.wordDict[word] == 1:
                                word = word
                        except:
                            CW = self.correct(word)
                            if "@" in word or "#" in word:
                                word = word
                            else:
                                if CW != "a":
                                    word = CW
            if "@" in word:
                word = "@user"
            tempTweet = " ".join([tempTweet, word.strip()])
    tempTweet = tempTweet.lower().strip()
    tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    # print(tempTweet.encode("utf-8"))
    return tempTweet

## Usage
# pre = Preprocess()
# pre.process("lol god pls help with my hw :) :(:D")
Developer: suddu16, Project: Youtube-Comedy-Comparison, Lines: 59, Source: PreprocessClass.py
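The nested try/except blocks in Example 8 act as a priority chain of dictionary lookups: emoticons first, then already-known words, then acronyms, then contractions, and finally spell correction. A rough sketch of the same cascade with explicit membership tests (hypothetical helper, not the project's code; the spell-correction fallback is omitted):

def normalize_token(word, emoticons, word_dict, acronyms, contractions):
    # try each normalization dictionary in priority order
    if word in emoticons:
        return emoticons[word]
    if word in word_dict:        # already a known dictionary word: keep it
        return word
    if word in acronyms:
        return acronyms[word]
    if word in contractions:
        return contractions[word]
    return word                  # the real class falls back to self.correct(word) here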
Example 9: process_line
def process_line(s, clean_string=True):
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    #return [process_token(None,token).lower() for token in tokens]
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c, token).lower().encode('UTF-8') for c, token in map(None, chunks, tokens)]
Developer: npow, Project: Ubuntu-Dialogue-Generationv2, Lines: 8, Source: createDictionaries.py
Example 10: all_tokens
def all_tokens(tweetreader):
    i = 0
    for r in tweetreader:
        i += 1
        tokens = tokenize(r[-1])
        for t in tokens:
            yield t
        if i >= 50000:
            return
Developer: zaycev, Project: n7, Lines: 9, Source: pmi.py
Example 11: process_line
def process_line(s, clean_string=True):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    sent = nltk.pos_tag(tokens)
    chunks = nltk.ne_chunk(sent, binary=False)
    return [process_token(c, token).lower().encode('UTF-8') for c, token in map(None, chunks, tokens)]
Developer: npow, Project: Ubuntu-Dialogue-Generationv2, Lines: 10, Source: find_testfiles.py
Example 12: get_idx_from_sent
def get_idx_from_sent(sent, word_idx_map, k):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    words = tokenize(sent)
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map[UNK_TOKEN])
    return x
Developer: BinbinBian, Project: ubottu, Lines: 12, Source: merge_data.py
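A quick illustration of what Example 12 produces, using a made-up word_idx_map (in the excerpt shown, the k argument is unused; padding presumably happens elsewhere in that project):

# hypothetical toy vocabulary for illustration only
UNK_TOKEN = '**unk**'
word_idx_map = {UNK_TOKEN: 1, 'hello': 2, 'world': 3, '!': 4}

idxs = get_idx_from_sent("hello new world !", word_idx_map, k=5)
# tokenize() yields ['hello', 'new', 'world', '!'], and 'new' is out of
# vocabulary, so idxs == [2, 1, 3, 4]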
Example 13: process_statuses
def process_statuses(self, statuses):
    statuses = [twokenize.tokenize(s.text.lower()) for s in statuses]
    for s in xrange(len(statuses)):
        w = 1
        while True:
            if w >= len(statuses[s]):
                break
            if statuses[s][w][0] == "'":
                # merge a token starting with an apostrophe into the previous token
                statuses[s] = statuses[s][:w-1] + [statuses[s][w-1] + statuses[s][w]] + statuses[s][w+1:]
                w = 0
            w += 1
    return statuses
Developer: goddardc, Project: nlp-twitter, Lines: 12, Source: main.py
Example 14: tokenize_and_clean
def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
Developer: AnnuSachan, Project: tweetmotif, Lines: 13, Source: bigrams.py
Example 15: normalize_tweet
def normalize_tweet(text, lowercase=False, rm_digits=False, return_tokens=False):
    if lowercase:
        text = text.lower()
    text = re.sub(URL_PATTERN, 'URL', text)
    tokens = twokenize.tokenize(text)
    if return_tokens:
        if rm_digits:
            # replace digit runs token by token
            tokens = [re.sub(NUM_PATTERN, 'NUM', tk) for tk in tokens]
        return tokens
    clean = ' '.join(tokens)
    if rm_digits:
        clean = re.sub(NUM_PATTERN, 'NUM', clean)
    return clean
Developer: imgemp, Project: semeval16, Lines: 13, Source: __init__.py
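An illustrative call to Example 15 (URL_PATTERN and NUM_PATTERN are regexes defined elsewhere in that module; the exact output depends on them and on the tokenizer's spacing):

clean = normalize_tweet("Check https://t.co/abc 42 likes!", lowercase=True, rm_digits=True)
# assuming URL_PATTERN matches the link and NUM_PATTERN matches digit runs,
# this returns roughly: "check URL NUM likes !"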
Example 16: preprocess
def preprocess(tweet):
    abbv_dict = json.load(open("../other/abbreviations.json"))
    emo_lexica_dict = json.load(open("../other/emotions.json"))
    for emoticon in emo_lexica_dict[u'emoticons']:
        abbv_dict[emoticon] = ' '
    for word in emo_lexica_dict[u'words']:
        abbv_dict[word] = ' '
    hash_transformer = Transformer.HashtagTransformer()
    sub_transformer = Transformer.SubstitutionTransformer(abbv_dict)
    preprocessor = Preprocessor([hash_transformer, sub_transformer])
    tweet = ' '.join(tokenize(tweet))
    tweet = preprocessor.transform(tweet)
    return tweet
Developer: i-DAT, Project: emotionannotate, Lines: 13, Source: Preprocessor.py
Example 17: process_line
def process_line(s, clean_string=True, enable_tags=False):
    """
    Processes a line by iteratively calling process_token.
    """
    if clean_string:
        s = clean_str(s)
    tokens = tokenize(s)
    if enable_tags:
        sent = nltk.pos_tag(tokens)
        chunks = nltk.ne_chunk(sent, binary=False)
        words = []
        for chunk in chunks:
            words += process_chunk(chunk)
        return [w.lower().encode('UTF-8') for w in words]
    else:
        return [process_token(token).lower().encode('UTF-8') for token in tokens]
Developer: pl8787, Project: UbuntuDataGenerator, Lines: 16, Source: TextPreprocess.py
Example 18: parse_tweets
def parse_tweets(tweets):
    parsed_tweets = []
    for tweet_json in tweets:
        try:
            #tweet_json = json.loads(tweet_str)
            tweet_text = tweet_json['text']
            if u'RT' in tweet_text:
                tweet_text = tweet_text[0:tweet_text.index(u'RT') - 1]
            tweet_token = tk.tokenize(tweet_text)
            tweet_token = [char_reduction(tok) for tok in tweet_token]
            # drop mentions, URLs, and bare punctuation that is not an emoticon
            tweet_token = [t for tok in tweet_token for t in es.expand(tok)
                           if not (('@' in t) or tk.Url_RE.search(t)
                                   or (not emo.Emoticon_RE.search(t) and tk.Punct_re.search(t)))]
            if tweet_token != []:
                tweet_obj = {"token": tweet_token,
                             "location": tweet_json['place']['country'] if tweet_json['place'] != None else None,
                             "json": tweet_json,
                             "type": ""}
                parsed_tweets.append(tweet_obj)
        except Exception as e:
            print e
    return parsed_tweets
Developer: siddharthmodala, Project: twittersentiment, Lines: 20, Source: parse.py
Example 19: read_tweets
def read_tweets(self, filename, emo):
    """Read tweets in raw format, returning a list of all tweets in the file"""
    emo_tweets = []
    non_emo_tweets = []
    with codecs.open(filename, encoding='utf8') as tweet_file:
        # tweet = []
        for line in tweet_file:
            data = json.loads(line)
            id = data['tweetid'].strip()
            text = data['text'].strip()
            emotions = data['emotions']
            tokens = tokenize(text)
            incount = 0
            for e in emotions:
                if e == emo:
                    incount = 1
            if incount == 1:
                emo_tweets.append(SPACE.join(tokens))
            elif incount == 0:
                non_emo_tweets.append(SPACE.join(tokens))
    return emo_tweets, non_emo_tweets
Developer: i-DAT, Project: emotionannotate, Lines: 21, Source: Preprocessor.py
Example 20: __init__
def __init__(self, line):
    fields = line.split('","')
    if fields[0] == '"0':
        self.senti = -1
    elif fields[0] == '"2':
        self.senti = 0
    elif fields[0] == '"4':
        self.senti = 1
    self.id = fields[1]
    self.date = fields[2]
    # self.text = fields[5][1:-1]
    self.text = normalization(fields[5][:-1])
    tokens = tokenize(self.text)
    self.tokens = tokens
    tokens_postag = nltk.pos_tag(tokens)
    wordnet_tag = []
    for each_pair in tokens_postag:
        if 'NN' in each_pair[1]:
            wordnet_tag.append((each_pair[0], 'n'))
        if 'JJ' in each_pair[1]:
            wordnet_tag.append((each_pair[0], 'a'))
        elif 'RB' in each_pair[1]:
            wordnet_tag.append((each_pair[0], 'r'))
        elif 'VB' in each_pair[1]:
            wordnet_tag.append((each_pair[0], 'v'))
    # tokens are lemmatized and lower-cased
    self.ltoken_tag = []
    for each_pair in wordnet_tag:
        lword = lemmatizer.lemmatize(each_pair[0], each_pair[1])
        self.ltoken_tag.append((lword.lower(), each_pair[1]))
    self.tweet_senti_score = []
    for each_pair in self.ltoken_tag:
        each_score = sentiextractor.get_score(each_pair)
        if abs(each_score) > 0.02:
            self.tweet_senti_score.append(each_score)
        else:
            self.tweet_senti_score.append(0)
Developer: alwayforver, Project: demoBasic, Lines: 40, Source: tweetAnalysis.py
Note: The twokenize.tokenize examples in this article were compiled by 纯净天空 from open-source projects hosted on GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from projects contributed by their respective open-source authors, and copyright remains with those authors; for redistribution and use, refer to each project's license. Do not reproduce without permission.