This article collects typical usage examples of the Python function stop_words.get_stop_words. If you have been wondering what get_stop_words does, how to call it, or what real-world uses look like, the curated code examples below should help.
The section below presents 20 code examples of the get_stop_words function, sorted by popularity by default. Upvoting the examples you find useful helps the system recommend better Python code samples.
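Before the project-specific examples, here is a minimal usage sketch of the function itself. It assumes the package that provides stop_words.get_stop_words is installed (typically via `pip install stop-words`); the language identifiers and the cache keyword mirror what the examples below use.

from stop_words import get_stop_words

# Languages can be requested by ISO code or by full name, as the examples below do.
en_stop = get_stop_words('en')                        # ISO 639-1 code
en_stop_full = get_stop_words('english')              # full language name
pt_stop = get_stop_words('portuguese', cache=True)    # reuse the cached word list on repeat calls

# Typical pattern from the examples: drop stop words from a tokenized text.
tokens = "this is a short example sentence for stop word removal".split()
content_tokens = [t for t in tokens if t not in en_stop]
print(content_tokens)

As Example 4 below suggests, requesting an unsupported language (e.g. 'klingon') raises an error, which is why that test wraps the call in a try/except block.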
Example 1: load_stoplist

def load_stoplist(topic_words=False, lang="en"):
    try:
        if lang == "en":
            if topic_words: return set(get_stop_words("en") + STOP_LIST + get_topic_stoplist())
            else: return set(get_stop_words("en") + STOP_LIST + stopwords.words('english'))
        elif lang == "nl":
            return set(get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except:
        print "warning: no stopwords were downloaded. check nltk corpora"
        print format_exc()
        return set()

Author: anukat2015, Project: taxi, Lines: 11, Source: morph.py
Example 2: test_filters

def test_filters(self):
    language = 'en'
    before = get_stop_words(language, False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]

    stop_words.add_filter(remove_letter)
    after = get_stop_words(language, False)
    for stopword in after:
        self.assertFalse(letter in stopword)
    self.assertTrue(stop_words.remove_filter(remove_letter))

Author: Alir3z4, Project: python-stop-words, Lines: 12, Source: tests.py
Example 3: get_most_freq

def get_most_freq(all_comments):
    APP_ROOT = os.path.dirname(os.path.abspath(__file__))
    APP_STATIC = os.path.join(APP_ROOT, 'static')
    file_name = os.path.join(APP_STATIC, 'freq_portugues.p')
    dict_freq = pickle.load(open(file_name, "rb"))
    web_stopWords = ["q", "vc", "vcs", "tipo", "ta", "pra", "pq", "ne", "sobre", "ser", "cara", "la"]

    all_comments = remove_accents(all_comments)
    tokens = all_comments.split()

    # build token dictionary
    dict_tokens = {}
    for token in tokens:
        if token in dict_tokens:
            dict_tokens[token] += 1
        else:
            dict_tokens[token] = 1

    # remove stop words
    stopWords = get_stop_words('portuguese', cache=True)
    stopWords += get_stop_words('english', cache=True)
    stopWords += web_stopWords
    for word in stopWords:
        dict_tokens.pop(remove_accents(word), None)

    #for word in dict_tokens:
    #    print(dict_tokens[token])
    #    dict_tokens[token] = 1+math.log(dict_tokens[token])

    # sort by frequency
    sorted_tokens = sorted(dict_tokens.items(), key=operator.itemgetter(1), reverse=True)
    num_tokens = int(min(len(sorted_tokens)/2, 1000))
    sorted_tokens = sorted_tokens[0:num_tokens]

    # normalize by frequency
    standart_frequency = dict_freq["acelga"]
    for i in range(len(sorted_tokens)):
        (token, value) = sorted_tokens[i]
        if token in dict_freq:
            sorted_tokens[i] = (token, math.log(value/dict_freq[token]))
        else:
            sorted_tokens[i] = (token, math.log(value/standart_frequency))
    sorted_tokens_after = sorted(sorted_tokens, key=operator.itemgetter(1), reverse=True)

    max_num_words = 100
    sorted_tokens_after = sorted_tokens_after[0:max_num_words]
    return sorted_tokens_after

Author: FaceBattle, Project: FaceBattle-TLDR, Lines: 52, Source: text_analise.py
Example 4: test_get_stop_words_cache

def test_get_stop_words_cache(self):
    self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
    sw = get_stop_words('fr')
    self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    self.assertEqual(sw, get_stop_words('french'))
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
    try:
        get_stop_words('klingon')
    except:
        pass
    self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)

Author: Alir3z4, Project: python-stop-words, Lines: 13, Source: tests.py
Example 5: word_list

def word_list(text):
    list = {}
    words = text.split()
    stop_words = get_stop_words('en')        # stop words is a list of common words used in English
    stop_words = get_stop_words('english')
    words = [word for word in words if word not in stop_words]  # removing stop words
    for i in words:
        if all(j.isdigit() for j in i):  # classifying token as number feature
            if list.has_key("NUMBER"):
                list["NUMBER"] += 1
            else:
                list["NUMBER"] = 1
        elif (len(i) >= 4 and i[0] == 'h' and i[1] == 't' and i[2] == 't' and i[3] == 'p'):
            if list.has_key("LINKS"):  # classifying token as link feature
                list["LINKS"] += 1
            else:
                list["LINKS"] = 1
        elif all(j in string.punctuation for j in i):
            if list.has_key("PUNCTUATION"):  # classifying token as punctuation feature
                list["PUNCTUATION"] += 1
            else:
                list["PUNCTUATION"] = 1
        elif len(i.translate(None, string.punctuation)) < 3:
            continue
        elif i.upper() == i:
            if list.has_key("CAPSLOCK"):  # classifying token as capitalized word feature
                list["CAPSLOCK"] += 1
            else:
                list["CAPSLOCK"] = 1
        else:
            j = i.translate(None, string.punctuation).lower()
            if list.has_key(j):
                list[j] += 1
            else:
                list[j] = 1
    return list

Author: saurabhanand1995, Project: Spam-Filter, Lines: 48, Source: spamFilter.py
Example 6: lemmatization_intern

def lemmatization_intern(lang, rss, result, doc):
    # Build and configure the TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang, TAGDIR=treetagger_path,
                                          TAGINENC='utf-8', TAGOUTENC='utf-8')
    # Run the tagger
    tags = tagger.TagText(rss)
    data = formatTTG(tags, tagger, stop_words.get_stop_words(language=lang))

    for k in [1, 2, 3]:
        i = 0
        liste = []
        while i <= len(data) - k:
            lemma = getLemma(data[i])
            for j in range(k - 1):
                lemma += " " + getLemma(data[i + j + 1])
            if lemma not in result:
                result[k-1][lemma] = 0
                doc[k-1][lemma] = 1
                liste += [lemma]
            elif lemma not in liste:
                doc[k-1][lemma] += 1
                liste += [lemma]
            result[k-1][lemma] += 1
            i += 1
    return result, doc

Author: Flasheur111, Project: SEO, Lines: 28, Source: lemmatization.py
Example 7: get_stopset

def get_stopset():
    """
    Gets a set of stopwords
    """
    stopset = set(get_stop_words('en'))
    # get those contractions
    add_stops = nltk.word_tokenize(' '.join(stopset))
    stopset.update(add_stops)
    # make sure to get contractions without punctuation, so that
    # order of operations doesn't matter later
    add_stops = [stopword.strip(string.punctuation)
                 for stopword in stopset]
    stopset.update(add_stops)
    # custom stop words
    add_stops = [u'lp', u'ep',
                 u'record', u'records', u'recorded',
                 u'label', u'labels',
                 u'release', u'releases', u'released',
                 u'listen', u'listens', u'listened', u'listener',
                 u'version', u'versions',
                 u'album', u'albums',
                 u'song', u'songs',
                 u'track', u'tracks',
                 u'sound', u'sounds',
                 u'thing', u'things', u'something',
                 u'music']
    stopset.update(add_stops)
    return stopset

Author: lwoloszy, Project: albumpitch, Lines: 31, Source: text_preprocess.py
Example 8: get_frequency

def get_frequency(self):
    # Select all the text in the database
    cursor = self.select_content('Content')
    # Initialise variables
    words = []
    count_handle = Counter()
    # Build the common-word list to be removed from the generated keyword list
    sw = stop_words.get_stop_words("english")
    # Extract all words from the given database
    for row in cursor:
        words += re.compile('\w+').findall(row[1])
    # Remove stop words from the 'words' list
    words = [w.lower() for w in words if w.lower() not in sw]
    # Calculate the frequency of all words in the given database
    for w in words:
        count_handle[w] += 1
    # Write the returned keywords into the file = category + "_keyword.txt"
    with open(self.out, 'w') as file_name:
        for word in count_handle.most_common(self.limit):
            file_name.write(word[0] + "\t" + str(word[1]) + "\n")

Author: jadeseeker, Project: Domain-Analyzer, Lines: 27, Source: getKeywords.py
Example 9: issue_analysis

def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # set tokenize Reg
    en_stop = get_stop_words('en')                  # create English stop words list
    p_stemmer = PorterStemmer()                     # Create p_stemmer of class PorterStemmer
    texts = []                                      # list for tokenized documents in loop
    text_view = ''

    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0

Author: choi-junhwan, Project: ConsumerComplaintsDataProject, Lines: 60, Source: Complaints_TextAnalysis.py
Example 10: cal_idf_overlap

def cal_idf_overlap():
    list_subj = utils.list_subject
    ls_distance_final = []
    ls_distance_row = []
    #print len(list_att)

    stop_words = get_stop_words('en')
    tmp_corpus = []
    for i in range(len(list_subj)):
        item = str(list_subj[i]).split(" ")
        for token in item:
            if token in stop_words:
                pass
            else:
                tmp_corpus.append(token)
    #print "corpus", corpus

    length = len(list_subj)
    for i in range(0, length):
        if i == 500 or i == 1000 or i == 1500:
            print i
        for j in range(0, length):
            print i, j
            idf_instance = IDF.IDF(str(list_subj[i]), str(list_subj[j]), tmp_corpus)
            distance = idf_instance.cal_overlap()
            ls_distance_row.append(distance)
        ls_distance_final.append(ls_distance_row)
        ls_distance_row = []

    myarray = np.asarray(ls_distance_final)
    print myarray

    Z = linkage(myarray, "ward")
    thefile = open('/Users/Aaron/test.txt', 'w')
    for item in Z:
        thefile.write("%s\n" % item)

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.show()

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,                   # show only the last p merged clusters
        show_leaf_counts=True,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,   # to get a distribution impression in truncated branches
    )
    plt.show()

Author: ycraaron, Project: CanonicalizationOKB, Lines: 60, Source: hac_idf_overlap.py
Example 11: getWordVector

def getWordVector(inputString):
    tokenizer = RegexpTokenizer(r'\w+\'?\w+')
    # default English stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    # It is considered to be the best for finding word roots
    p_stemmer = PorterStemmer()

    raw = inputString.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # POS-tag the remaining words (nouns, adjectives, adverbs and verbs)
    pos_tagged = nltk.pos_tag(stopped_tokens)
    # stem tokens
    # p_stemmer.stem(i[0]) and other additions in if condition - or i[1][0] == 'R' or i[1][0] == 'V'
    stemmed_tokens = [i[0]
                      for i in pos_tagged
                      if i[1][0] == 'N']  # or i[1][0] == 'J']
    return stemmed_tokens

Author: pralhadsapre, Project: Yelp-Project, Lines: 27, Source: TopicModeler.py
Example 12: lda_approach_one

def lda_approach_one():
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    # doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    # doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    # doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    # doc_e = "Health professionals say that brocolli is good for your health."
    # doc_set = [doc_a, doc_b, doc_c, doc_e]
    print db.find().count()
    doc_set = [i['abstract'] for i in db.find()]
    texts = []
    for i in doc_set:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=4,
        id2word=dictionary,
        passes=20
    )
    print ldamodel.print_topics(10)

Author: nikhil2kulkarni, Project: biz, Lines: 27, Source: general_lda_word2vec.py
Example 13: get_corpus

def get_corpus():
    db_conn = MySQLdb.connect(host="localhost", port=8889, db="linked_reverb", user="root", passwd="root")
    cursor = db_conn.cursor()
    cursor.execute("select argument1, argument2 from linked_entity80_a")

    ls_result = []
    ls_corpus = []

    row_count = int(cursor.rowcount)
    for i in range(0, row_count):
        row = cursor.fetchone()
        ls_result.append(row)

    stop_words = get_stop_words('en')
    for i in range(len(ls_result)):
        for item in ls_result[i][0].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)
        for item in ls_result[i][1].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)
        #
        # ls_corpus.append(ls_result[i][0].split(" "))
        # ls_corpus.append(ls_result[i][1].split(" "))

    db_conn.close()
    return ls_corpus

Author: ycraaron, Project: CanonicalizationOKB, Lines: 33, Source: main.py
Example 14: convert_amazon_to_dict

def convert_amazon_to_dict(dict_field, is_text, in_fname, out_fname):
    id = 0
    num_entries = 0
    field_dict = {'': 0}
    stop_words = get_stop_words('en')

    for entry in parse_amazon(in_fname):
        if entry.has_key(dict_field):
            num_entries += 1
            # if text field, parse and populate.
            if is_text:
                words = entry[dict_field].split()
                for word in words:
                    stemmed_word = stem(word)
                    if stemmed_word not in stop_words and stemmed_word not in field_dict:
                        id += 1
                        field_dict[stemmed_word] = id
            else:
                if entry[dict_field] not in field_dict:
                    id += 1
                    field_dict[entry[dict_field]] = id
                    #printf('%s -> %d\n', entry[dict_field], id)
            #if id > 100:
            #    break

    print "num_entries:", num_entries
    print "length of field_dict:", len(field_dict)
    with open(out_fname, 'wb') as outf:
        pickle.dump(field_dict, outf)

Author: fruitfly1026, Project: tensors, Lines: 28, Source: process.py
Example 15: load_dataset

def load_dataset(dataset_file):
    """
    It is more efficient (O(1) vs. O(n)) to search a dictionary or a set
    compared to a list, as they are implemented with a hash.
    Therefore, the dataset is kept with 2 dictionaries where
    the values are sets.
    """
    items_original_form = defaultdict(set)
    items_by_keyword_start = defaultdict(set)
    items_by_id = defaultdict(set)
    stop_words = get_stop_words('english')

    with open(dataset_file) as f:
        lines = csv.reader(f, delimiter=',')
        for line in lines:
            item_id, *descriptors = line
            # save original form (3 separate fields:
            # id, description, company name) for output
            items_original_form[item_id] = descriptors
            # create 2 dictionaries for searching:
            # 1. Key: 3 lower-case first letters of each
            #    word of item descriptors. Value: item ids.
            # 2. Key: item id. Value: item descriptors in lower-case.
            descriptors_set = set(" ".join(descriptors).lower().split())
            for d in descriptors_set:
                if d not in stop_words:
                    items_by_keyword_start[d[:3]].add(item_id)
            items_by_id[item_id] = descriptors_set
    return (items_by_keyword_start, items_by_id, items_original_form)

Author: noamba, Project: query-dataset, Lines: 34, Source: search_app_improved.py
Example 16: bag_of_words_vectorizer

def bag_of_words_vectorizer(datafile, k_features):
    """
    Computes sparse term-document matrix of datafile documents, selects k best features by chi2 test.
    Yields batches of BATCH_SIZE of dense tdm vectors and vector of labels, transformed for keras nn.
    """
    data = []
    labels = []
    for jsoned_entity in open("data.json", errors="ignore").readlines():
        entity = json.loads(jsoned_entity)
        if entity["lang"] == "en":
            data.append(entity["text"])
            labels.append(entity["label"])

    vectorizer = TfidfVectorizer(stop_words=get_stop_words("english"))
    data = vectorizer.fit_transform(data)
    data = SelectKBest(chi2, k=k_features).fit_transform(data, labels)

    for vector_label_batch in batch(zip(data, labels), config.BATCH_SIZE):
        vectors = []
        labels = []
        for vec_label in vector_label_batch:
            vectors.append(vec_label[0].toarray())
            labels.append(vec_label[1])
        X = np.vstack(vectors)
        Y = np_utils.to_categorical(labels, 2)
        yield X, Y

Author: WarmongeR1, Project: pydigest-classifier, Lines: 28, Source: vectorizer.py
Example 17: preprocess_wikidata

def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()
    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)

Author: DailyActie, Project: AI_APP_CV-TextTopicNet, Lines: 31, Source: preprocess_text.py
Example 18: process_line_mymodel

def process_line_mymodel(line):
    """
    @params
        line: list of all tokens contained in a line
              format: id_img nb_pairs(word, points) w1 p1 w2 p2 .... wn pn
    return: key, value for the dictionary
        key: id_img
        value: list of pairs w-p
    remove stop words?
    """
    en_stop = get_stop_words('en')
    #print en_stop
    key = line[0]
    nb_pairs = int(line[1])
    i = 0
    value = []
    weights = {}
    while i < nb_pairs*2:
        #print line[2+i]
        #if line[2+i] not in en_stop:
        value.append(re.sub(r'[^\x00-\x7f]', r'', line[2+i]))
        weights[re.sub(r'[^\x00-\x7f]', r'', line[2+i])] = int(line[3+i])
        i += 2
    #assert nb_pairs == len(value), "length of data diferent (nb_pairs =/= len(pairs))"
    return key, value, weights

Author: tmdavid, Project: KU_Leuven, Lines: 26, Source: file_preparation.py
Example 19: textToWordList

def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом',
        'город',
        "дорог",
        "час",
        "ноч",
        "слов",
        "утр",
        "стран",
        "пут",
        "путешеств",
        "мест",
        'нов',
        "друз",
        "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens

Author: Omrigan, Project: travelrec, Lines: 25, Source: views.py
Example 20: createLDAModel

def createLDAModel(texts, n_topics, n_passes):
    """Generates a LDA model from an array of texts
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # Create EN stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    texts_ = []
    # loop through document list
    for i in texts:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts_.append(stemmed_tokens)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts_)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts_]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word=dictionary, passes=n_passes)
    return (ldamodel)

Author: gsi-upm, Project: gsitk, Lines: 35, Source: textual_features.py
Note: The stop_words.get_stop_words examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. For distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.