This article collects typical usage examples of the Python function stemming.porter2.stem. If you have been wondering how stem is actually used in practice, the curated code examples below may help.
Twenty code examples of the stem function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
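Before the examples, here is a minimal self-contained sketch of the basic API (assuming the stemming package from PyPI is installed; the outputs in the comments reflect the Porter2 algorithm):

from stemming.porter2 import stem

# porter2.stem maps inflected forms onto a common stem. It does not
# lowercase its input, so callers typically lowercase first.
print(stem("running"))    # run
print(stem("cats"))       # cat
print(stem("happiness"))  # happi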
Example 1: get_pmi

def get_pmi(self, word0, word1):
    """Return the pointwise mutual information, a measure of word
    association within a window, for two words. This is normalized
    using Bouma (2009) to avoid infinite values for OOV terms.
    """
    word0 = word0.lower()
    word1 = word1.lower()
    if self.stemming:
        word0 = porter2.stem(word0)
        word1 = porter2.stem(word1)
    if word0 not in self.word_counts or word1 not in self.word_counts:
        return -1
    # Pair counts are stored under the lexicographically smaller word first.
    if word0 < word1:
        pair_counts = self.word_pair_counts[word0][word1]
    else:
        pair_counts = self.word_pair_counts[word1][word0]
    if pair_counts == 0:
        return -1
    # Total token count. The original excerpt indexed word_counts with an
    # undefined name here; summing the unigram counts is one reasonable fix.
    num_words = sum(self.word_counts.values())
    # TODO: confirm normalization. Currently assuming words are
    # normalized by num_words and pairs by num_words^2.
    ratio = pair_counts / (self.word_counts[word0] *
                           self.word_counts[word1])
    pmi = np.log(ratio)
    normalized_pmi = -pmi / np.log(pair_counts / (num_words * num_words))
    return normalized_pmi

Developer: mcka1n | Project: dissertation | Lines: 33 | Source: stats.py
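For context, Bouma's (2009) normalization bounds PMI to [-1, 1] by dividing by -log p(x, y). Here is a tiny standalone sketch of the same computation with hypothetical toy counts, following the excerpt's assumption that pair counts are normalized by num_words squared:

import numpy as np

# Hypothetical toy counts, for illustration only.
num_words = 1000.0
count_x, count_y, count_xy = 50.0, 40.0, 20.0

pmi = np.log(count_xy / (count_x * count_y))   # same ratio as in get_pmi
p_xy = count_xy / num_words ** 2
npmi = -pmi / np.log(p_xy)                     # bounded in [-1, 1]
print(npmi)                                    # about -0.43 for these counts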
Example 2: find_collocations

def find_collocations(file_name, data, popular_word):
    text_file = open(file_name, 'r')
    file_content = text_file.read()
    most_common_words = find_most_common_words(file_content, popular_word)
    second_word = None
    third_word = None
    collocations = data
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower()):
                count_collocations(collocations, stem(first_word.lower()), stem(second_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    count_collocations(collocations, first_word, second_word)
    collocations = find_whole_collocations_from_stems(collocations, file_content)
    return collocations, most_common_words, file_content

Developer: Ogleiv | Project: IWI | Lines: 27 | Source: collocations_file.py
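A note on the pattern shared by Examples 2, 4 and 5 below: the chain of first_word/second_word/... assignments implements a sliding window over the token stream, and the block after the loop runs the window one final step so the last token's pair is counted too. Stemming the lowercased words before counting lets inflected variants of a collocation accumulate into a single entry.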
Example 3: calculateScore

def calculateScore(query, qID):
    sfile = open('../AP_DATA/stoplist.txt', 'r')
    sList = sfile.read().split('\n')
    query = query.lower()
    qList = re.findall(r"\w+[\.?\w+]*", query)
    temp = list()
    for term in qList:
        # The original used bitwise '&' here, which binds tighter than the
        # comparisons; 'and' gives the intended behavior.
        if term.endswith('.') and term.count('.') == 1 and len(term) > 1:
            term = term.replace('.', '')
        if term.startswith('_') and term.count('_') == 1 and len(term) > 1:
            term = term.replace('_', '')
        temp.append(term)
    qList = temp
    if index_num == '4':
        qList = [i for i in temp if i not in sList]
        temp = list()
        for term in qList:
            term = stem(term)
            temp.append(term)
        qList = temp
    if index_num == '3':
        temp = list()
        for term in qList:
            term = stem(term)
            temp.append(term)
        qList = temp
    if index_num == '2':
        qList = [i for i in temp if i not in sList]
    # (the remainder of the function is truncated in the original listing)

Developer: vaibhavty | Project: InformationRetrieval | Lines: 35 | Source: score.py
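The index_num branches appear to select among differently preprocessed indexes: '4' applies both the stoplist and stemming, '3' stems without stopping, and '2' only removes stopwords (an inference from the code; the excerpt ends before the scoring itself).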
Example 4: find_collocations_tri

def find_collocations_tri(filename):
    text_file = open(filename, 'r')
    most_common_words = find_most_common_words(text_file, 100)
    second_word = None
    third_word = None
    fourth_word = None
    collocations = dict()
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = fourth_word
            fourth_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words and
                    third_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower() and
                     third_word and third_word[0].islower()):
                count_collocations_tri(collocations, stem(first_word.lower()),
                                       stem(second_word.lower()), stem(third_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    count_collocations_tri(collocations, first_word, second_word, third_word)
    sort_collocations_tri(collocations)

Developer: Ogleiv | Project: IWI | Lines: 27 | Source: kolotri.py
Example 5: find_collocations_penta

def find_collocations_penta(text, data, popular_word):
    most_common_words = find_most_common_words(text, popular_word)
    second_word = None
    third_word = None
    fourth_word = None
    fifth_word = None
    sixth_word = None
    collocations = data
    for word in text.split():
        first_word = second_word
        second_word = third_word
        third_word = fourth_word
        fourth_word = fifth_word
        fifth_word = sixth_word
        sixth_word = trim_word(word)
        if (first_word not in most_common_words and second_word not in most_common_words and
                third_word not in most_common_words and fourth_word not in most_common_words and
                fifth_word not in most_common_words) and \
                (first_word and first_word[0].islower() and second_word and second_word[0].islower() and
                 third_word and third_word[0].islower() and fourth_word and fourth_word[0].islower() and
                 fifth_word and fifth_word[0].islower()):
            count_collocations_penta(collocations, stem(first_word.lower()), stem(second_word.lower()),
                                     stem(third_word.lower()), stem(fourth_word.lower()),
                                     stem(fifth_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    fourth_word = fifth_word
    fifth_word = sixth_word
    count_collocations_penta(collocations, first_word, second_word, third_word, fourth_word, fifth_word)
    return collocations, most_common_words

Developer: Ogleiv | Project: IWI | Lines: 30 | Source: collocations_wikipedia_penta.py
Example 6: tokenize_porter

def tokenize_porter(title, body):
    """Break text into words and stem them using the Porter stemmer."""
    # break up words & remove stopwords
    title_break = stopWords(nltk.word_tokenize(title), lower_case=True)
    body_break = stopWords(nltk.word_tokenize(body), lower_case=True)
    return (["title:" + stem(title) for title in title_break] +
            ["body:" + stem(body) for body in body_break])

Developer: JasperHG90 | Project: naiveBayes-guardian-articles | Lines: 7 | Source: naiveBayes.py
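The "title:" and "body:" prefixes keep title tokens and body tokens as distinct features, presumably so the classifier can weight a word differently depending on where it occurs.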
Example 7: ngram_in_collection

def ngram_in_collection(ngram, coll):
    """
    Check whether the ngram's component stems overlap the collection's stems
    """
    s1 = {stem(word) for word in ngram.split(' ')}
    s2 = {stem(word) for word in coll}
    return len(s1.intersection(s2)) > 0

Developer: shankark10n | Project: ecotrends | Lines: 7 | Source: picker.py
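A quick usage sketch (hypothetical inputs): stemming both sides lets morphological variants match.

print(ngram_in_collection("neural networks", ["network", "graph"]))  # True: "networks" stems to "network"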
Example 8: tokenize

def tokenize(self):
    punc = """\\.!?,(){}[]"'"""
    wordarray = []
    for c in self.document.lower().split():
        # Strip punctuation once, then test against the stopword list.
        # (The original stripped only whitespace in the test but punctuation
        # in the append, so stopwords followed by punctuation slipped through.)
        token = stem(c.strip(punc))
        if token not in self.corpus.stopwords:
            wordarray.append(token)
    return wordarray

Developer: dydt | Project: dialectgaussmix | Lines: 7 | Source: tokenizer.py
Example 9: cleanText

def cleanText(text, entities, category):
    cleanText = text
    hashtags = entities.get('hashtags', [])
    ranges = []
    for hashtag in hashtags:
        if hashtag.get('text', '').lower() == category:
            indices = hashtag.get('indices')
            ranges.append(indices)
    urls = entities.get('urls', [])
    urls.reverse()
    ranges.extend([v for url in urls for k, v in url.iteritems() if k == 'indices'])
    media = entities.get('media', [])
    media.reverse()
    ranges.extend([v for medium in media for k, v in medium.iteritems() if k == 'indices'])
    # Remove spans from the end of the string first so earlier indices stay valid.
    ranges = sorted(ranges, key=lambda x: x[0], reverse=True)
    for r in ranges:
        cleanText = cleanText[:r[0]] + cleanText[r[1] + 1:]
    # Drop any remaining word that shares a stem with the category.
    category_stem = stem(category).lower()
    cleanTextList = cleanText.split(' ')
    cleanText = []
    for word in cleanTextList:
        if category_stem not in stem(word).lower() and stem(word).lower() not in category_stem:
            cleanText.append(word)
    cleanText = " ".join(cleanText)
    return cleanText

Developer: atran3 | Project: sarcasm_detection | Lines: 26 | Source: cleanTweets.py
Example 10: read

def read(self, publication_keyword, publication_data):
    words = open(publication_keyword, 'r').readlines()
    for i in range(0, self.topic_number):
        s = stem(words[i].split('\t')[0])
        self.topics[s] = dict()
        self.stemword_dict[s] = words[i].split('\t')[0]
    content = open(publication_data, 'r').readlines()
    counter = 0
    year = ''
    for i in content:
        # four lines represent one publication (the counter is taken modulo 4)
        if counter % 3000 == 0:
            print(counter / 3)
        # record the year of this publication
        if counter % 4 == 1:
            year = int(i.strip())
        # parse the keywords of this publication
        elif counter % 4 == 3:
            keywords = i.strip().split(' ')
            for j in keywords:
                j = stem(j)
                if j in self.topics:
                    if year in self.topics[j]:
                        self.topics[j][year] += 1
                    else:
                        self.topics[j][year] = 1
        counter = counter + 1

Developer: JingqingZ | Project: AminerKnowledgeGraph | Lines: 27 | Source: time_keyword_distribution.py
Example 11: makeFreqDictionaryOfSentenceWords

def makeFreqDictionaryOfSentenceWords(s1):
    words1 = s1.split()
    dt1 = {}
    for w in words1:
        if w.lower() not in stopwords:
            key = stem(w.lower())
            dt1[key] = dt1.get(key, 0) + 1
    return dt1

Developer: farazbhinder | Project: article-summarization | Lines: 7 | Source: algo2.py
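A usage sketch, assuming a module-level stopwords collection as in the source project (the stoplist here is hypothetical):

stopwords = {"the", "a", "is"}
print(makeFreqDictionaryOfSentenceWords("The cat chases cats"))
# {'cat': 2, 'chase': 1} -- "chases" and "cats" collapse onto their stems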
Example 12: sentence_matches

def sentence_matches(self, sentence_text):
    """Returns true iff the sentence contains this mention's upstream
    and downstream participants, and if one of the stemmed verbs in
    the sentence is the same as the stemmed action type."""
    has_upstream = False
    has_downstream = False
    has_verb = False
    # Get the first word of the action type and assume this is the verb
    # (e.g. get "depends" for "depends on")
    actiontype_words = word_tokenize(self.mention.actiontype)
    actiontype_verb_stemmed = stem(actiontype_words[0])
    words = word_tokenize(sentence_text)
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.upstream.lower()):
        has_upstream = True
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.downstream.lower()):
        has_downstream = True
    for word in words:
        if actiontype_verb_stemmed == stem(word):
            has_verb = True
    return has_upstream and has_downstream and has_verb

Developer: johnbachman | Project: indra | Lines: 28 | Source: find_full_text_sentence.py
Example 13: getVocabularyStem

def getVocabularyStem(content):
    vocabulary = {}
    index = 0
    for token in content:
        s = stem(token)
        if s not in vocabulary:
            vocabulary[s] = index
            index = index + 1
    return vocabulary

Developer: yanyankangkang | Project: Text-Mining | Lines: 8 | Source: feature_hy.py
Example 14: main

def main():
    nlp_file = open(sys.argv[1], "r")
    for line in nlp_file:
        words = line.strip().split(" ")
        for word in words:
            print stem(word)
    nlp_file.close()

Developer: m-note | Project: 100knock2015 | Lines: 8 | Source: knock52.py
Example 15: getSentTf

def getSentTf(sent, stopwords):
    doc = dict()
    for word in re.split("[^a-zA-Z0-9]", sent):
        word = word.lower()
        if word != "" and word != "'":
            s = stem(word)
            if s not in stopwords:
                doc[s] = doc.get(s, 0) + 1
    return doc

Developer: imsorry1121 | Project: paper_label | Lines: 10 | Source: ir.py
Example 16: filter

def filter(self):
    # do not generate an html file, just filter the correct relationships
    correct_list = list()
    for i in range(0, len(self.linklist)):
        key0 = stem(self.linklist[i][0])
        key1 = stem(self.linklist[i][1])
        if self.judge(key0, key1, i) is False:
            continue
        correct_list.append(i)
    return correct_list

Developer: JingqingZ | Project: AminerKnowledgeGraph | Lines: 10 | Source: generate_view.py
Example 17: get_word_embedding

def get_word_embedding(word, w2vmodel):
    if word in w2vmodel:
        return w2vmodel[word]
    elif stem(word) in w2vmodel:
        return w2vmodel[stem(word)]
    elif word.lower() in w2vmodel:
        return w2vmodel[word.lower()]
    elif stem(word.lower()) in w2vmodel:
        return w2vmodel[stem(word.lower())]
    else:
        return None

Developer: CLC-HCMUS | Project: vts | Lines: 11 | Source: englishsum.py
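Since only the in operator and item lookup are used, a plain dict works as a stand-in model for a quick test (toy vector, hypothetical):

w2v = {"run": [0.1, 0.2]}
print(get_word_embedding("Running", w2v))  # [0.1, 0.2], found via the stem/lowercase fallbacks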
Example 18: naive_wc_sim

def naive_wc_sim(str1, str2):
    list1 = nltk.word_tokenize(str1)
    list2 = nltk.word_tokenize(str2)
    count = 0
    for w1 in list1:
        stw1 = stem(w1)
        for w2 in list2:
            stw2 = stem(w2)
            if stw1 == stw2:
                count += 1
    return (1.0 * count) / (1.0 * min(len(list1), len(list2)))

Developer: giahy2507 | Project: multsum | Lines: 11 | Source: summarize_from_db.py
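A quick sanity check (hypothetical strings; nltk's punkt tokenizer data must be available):

print(naive_wc_sim("he runs fast", "she was running"))
# 1/3: "runs" and "running" share the stem "run"; min token count is 3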
Example 19: form_regex_for_common_words

def form_regex_for_common_words():
    expr = ""
    count = 0
    common_words = fp.read().split()
    for word in common_words:
        count += 1
        if count == len(common_words):
            expr += "^" + stem(word) + "$"
        else:
            expr += "^" + stem(word) + "$|"
    return expr

Developer: sahilkumarmaths | Project: IR | Lines: 11 | Source: stop_words.py
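For instance, if the file behind fp contained the two words "the running", the function would return "^the$|^run$": anchored alternatives over the stems, with the trailing "|" omitted after the last word.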
Example 20: getDocTf

def getDocTf(fileName, stopwords):
    doc = dict()
    with open(fileName, "r") as fi:
        for line in fi:
            for word in re.split("[^a-zA-Z0-9]", line.strip()):
                word = word.lower()
                if word != "" and word != "'":
                    s = stem(word)
                    if s not in stopwords:
                        doc[s] = doc.get(s, 0) + 1
    return doc

Developer: imsorry1121 | Project: tdt | Lines: 12 | Source: ir.py
Note: The stemming.porter2.stem examples in this article were compiled by 纯净天空 from open-source projects hosted on GitHub, MSDocs, and similar code and documentation platforms. The snippets come from projects contributed by many developers, and copyright of the source code remains with the original authors. When distributing or using this code, please refer to the corresponding project's license; do not repost without permission.