Python snowball.SnowballStemmer Class Code Examples


This article collects and summarizes typical usage examples of the Python class nltk.stem.snowball.SnowballStemmer. If you have been wondering what the SnowballStemmer class is for, how to use it, or what it looks like in real code, the curated class code examples below may help.



Twenty code examples of the SnowballStemmer class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
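
Before the individual examples, here is a minimal usage sketch of the class itself: constructing a stemmer for a supported language, the optional ignore_stopwords flag, and the stem() method. The sample words are chosen purely for illustration and are not taken from the projects below.

from nltk.stem.snowball import SnowballStemmer

# The supported languages are exposed as a class attribute
print(SnowballStemmer.languages)   # ('arabic', 'danish', ..., 'english', ..., 'spanish', 'swedish')

# Build a stemmer for one language; ignore_stopwords=True leaves that language's stop words unstemmed
stemmer = SnowballStemmer("english", ignore_stopwords=False)

# stem() maps an inflected word to its stem
for word in ["running", "generously", "cats"]:
    print(word, "->", stemmer.stem(word))   # running -> run, generously -> generous, cats -> cat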

Example 1: _text_to_words

    def _text_to_words(self, text):
        '''
        Process a text and return a list of words.
        The processing performs the following steps:
            - lowercase the text
            - tokenization
            - removal of stop words
            - stemming of the words
        '''
        # Lowercase the text
        text = text.lower().strip()

        # Tokenization
        tokens = word_tokenize(text, language="english")

        # Remove tokens that start with an apostrophe
        # (tokenization turns "I'd like" into ["I", "'d", "like"]
        #  and we can do without "'d")
        tokens = [token for token in tokens if not token.startswith("'")]

        # Stop words
        # Remove stop words from the token list.
        # In addition to the stop words provided with the collection, add the common
        # English stop words from NLTK and punctuation (except parentheses, which are useful for boolean queries)
        stop_words = self.stop_words + list(string.punctuation) + stopwords.words("english")
        tokens = [token for token in tokens if token not in stop_words]

        # Stemming
        stemmer = SnowballStemmer(language="english")
        tokens = [stemmer.stem(word) for word in tokens]

        return tokens
Author: rcatajar, Project: recherche-web-ecp, Lines: 32, Source: index.py


Example 2: get_stemm_tags

	def get_stemm_tags(self, tags):
		stemm_tags = []
		current_stemmer = SnowballStemmer('english')
		for tag in self.tags:
			stemm_tags.append(current_stemmer.stem(tag.lower()))
		
		return stemm_tags
Author: lszyman, Project: PressNotePropagation, Lines: 7, Source: cluster.py


Example 3: parseOutText

def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""

    stemmer = SnowballStemmer("english")
    if len(content) > 1:
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
        
        split = text_string.split()  
        text = [stemmer.stem(word) for word in split]
        words = ' '.join(text)


    f.close()

    return words.strip()
Author: AbhinavJain13, Project: Udacity, Lines: 31, Source: parse_out_email_text.py


Example 4: tokenize

def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("","")

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)

    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens
Author: lingcheng99, Project: search-term-relevance-home-depot, Lines: 27, Source: preprocess.py


Example 5: tokenize

    def tokenize(self, document):
        """
        Break text into sentences and each sentence into a list of single words
        Ignore any token that falls into the stopwords set.
        """
        # use sentence tokenizer sent_tokenize from nltk package
        sentences = sent_tokenize(utils.to_unicode(document.lower()))

        # create stemmer of class SnowballStemmer
        stemmer = SnowballStemmer("english")

        for sentence in sentences:
            words = [word
                   for word in utils.tokenize(
                    self.cleanse_text(sentence)
                   )]

            if self.remove_stopwords:
                words = [ 
                         word for word in words 
                         if word not in self.en_stopwords
                        ]

            if self.stemming:
                words = [stemmer.stem(t) for t in words]

            yield words
Author: RajeshThallam, Project: job-fiction, Lines: 27, Source: model_building.py


Example 6: pos_tokenizer

def pos_tokenizer(s): #define a tokenizer that uses POS tagging
    texts=nltk.word_tokenize(s)

    texts=[word for word in texts if len(word)>2]

    # PULL OUT NOUN AND VERB PHRASES
    chunktext=nltk.pos_tag(texts)
    patterns="""
                VP:{<V.*><DT>?<JJ.*>?<NN.*>}
                NP:{<DT>?<JJ>*<NN.*>}
                N:{<NN.*>}
    """
    NPchunker=nltk.RegexpParser(patterns)

    from nltk.stem.snowball import SnowballStemmer
    st=SnowballStemmer('english')

    #print text
    temp=[]
    result=NPchunker.parse(chunktext)
    #print result
    for phrase in result:
        try:
            phrase.label()
            string=''
            m=0
            for word in phrase:
                if m==0:
                    string+=st.stem(word[0])
                    m+=1
                else: string+=' '+st.stem(word[0])
            temp.append(string)
        except: pass
    return temp
Author: ecpaulson, Project: Intuitive-CMS, Lines: 34, Source: SBA_tweet_sklearn.py


Example 7: text_cleaner_and_tokenizer

def text_cleaner_and_tokenizer(texts):
    """
    takes a list of sentences, removes punctuation, numbers, stopwords and stems.
    Then joins everything back together and returns the filtered texts as a list of unicode strings
    :param texts: list of unprocessed strings
    :return: list of unicode strings
    """
    i = 0
    stopword_list = set(stopwords.words('danish'))
    stemmer = SnowballStemmer("danish", ignore_stopwords=False)
    filtered_texts = []

    for sentence in texts:
        for symbol in punctuation:
            sentence = sentence.replace(symbol,'')
        for num in numbers:
            sentence = sentence.replace(str(num),'')
        sentence = sentence.decode('utf-8').lower()
        words_in_sentence = word_tokenize(sentence, language='danish')
        filtered_sentence = []
        for word in words_in_sentence:
            if word not in stopword_list:
                stem_word = stemmer.stem(word)
                filtered_sentence.append(stem_word)

        sentence = ' '.join(filtered_sentence)
        filtered_texts.append(sentence)

        i = i +1
        if i % 1000 == 0:
            print(i)
    print('Done :D!')
    return filtered_texts
Author: AdamHede, Project: text_cnn, Lines: 33, Source: data_helpers.py


Example 8: tokenize

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    for item in tokens:
        stems.append(stemmer.stem(item))
    return stems
Author: anuragxel, Project: wiki-doc-classification, Lines: 7, Source: docvec_extractor.py


Example 9: read_corpus

def read_corpus(corpus_file, use_sentiment):
	"Reads in the corpus and returns the documents and labels"
	documents = []
	labels = []
	with open(corpus_file, encoding='utf-8') as f:
		for line in f:
			tokens = line.strip().split()
			use_stopword = False
			if use_stopword:
				stopwordfile = open('stopwords.txt', 'r')
				stopwords = []
				for line in stopwordfile:
					if len(line) > 0:
						splitline = line.split(',')
						for word in splitline:
							stopwords.append(word)

				tokenlist = [token for token in tokens[3:] if token not in stopwords]
				documents.append(find_ngrams(tokenlist, 2))
			else:
				snowballstemmer = SnowballStemmer('english')
				stemmedtokens = [snowballstemmer.stem(word) for word in tokens[3:]]
				#documents.append(stemmedtokens)
				documents.append(find_ngrams(stemmedtokens, 2))
			if use_sentiment:
				# 2-class problem: positive vs negative
				labels.append( tokens[1] )
			else:
				# 6-class problem: books, camera, dvd, health, music, software
				labels.append( tokens[0] )

	return documents, labels
Author: Martbov, Project: LearningFromData, Lines: 32, Source: LFDassignment5_SVM_Mart.py


Example 10: clean_data

def clean_data(data):
    '''
    Stems and removes stop words from training and test data
    '''
    stemmer = SnowballStemmer('english')
    stop = stopwords.words('english')
    for column_name in ['query', 'product_title', 'product_description']:
        for index, row in data.iterrows():
            warnings.filterwarnings('error')
            try:
                extracted_data = (' ').join(
                    [i for i in BeautifulSoup(row[column_name], 'lxml')
                    .get_text(' ')
                    .split(' ')
                    ])
            except UserWarning:
                pass
            cleaned_data = re.sub('[^a-zA-Z0-9]',' ', extracted_data)
            stemmed_data = (' ').join(
                [stemmer.stem(i) for i in cleaned_data.split(' ')
                ])
            remove_stop_words = (' ').join(
                [i for i in stemmed_data.split(' ') if i not in stop]
                )
            data.set_value(index, column_name, unicode(remove_stop_words))
    return data
Author: arvin-dwarka, Project: Udacity_Machine_Learning_Engineer, Lines: 26, Source: main.py


Example 11: get_core_words

def get_core_words( text ):

    #TOKENIZATION
    b = word_tokenize(text)

    #KEEP ONLY NOUNS
    b = [noun for noun, pos in pos_tag(b) if pos.startswith('N')]

    #CONVERT INTO LOWER CASE
    looper = 0
    for token in b:
        b[looper] = token.lower()
        looper+=1

    #REMOVE THE STOPWORDS FROM THE FILE
    minlength = 2
    c = [token for token in b if (not token in stopwords.words('english')) and len(token) >= minlength]

    #STEMMING THE WORDS TO ITS BASE FORM
    stemmer = SnowballStemmer("english")
    looper1 = 0
    for token in c:
        c[looper1] = stemmer.stem(token.decode("utf8"))
        looper1 +=1
    return c
Author: MoMenne, Project: global-hack-II, Lines: 25, Source: parser.py


Example 12: prune

def prune(doc, stoplist = None, stem = True, english_dictionary_words = False):
    """This takes a single document and tokenizes the words, removes
    undesirable elements, and prepares it to be loaded into a dictionary.
    """
    # Tokenize the document and make it lowercase
    temp = utils.simple_preprocess(doc.lower())

    # Remove freestanding punctuation and punctuation in words
    temp = [w for w in temp if w not in string.punctuation]
    temp = [rmPunct(w) for w in temp]

    # Remove words in passed stoplist
    if stoplist:
        temp = [w for w in temp if w not in stoplist]

    # Remove specific tokens
    temp = [w for w in temp if w not in set(['[', ']', "'", '\n', 'com'])]

    # Remove stopwords
    temp = [w for w in temp if w not in stopwords.words('english')]

    # Stem the remaining words
    if stem:
        stemmer = SnowballStemmer('english')
        temp = [stemmer.stem(w) for w in temp]

    if english_dictionary_words:
        d = enchant.Dict("en_US")
        temp = [w for w in temp if d.check(w)]
    return temp
Author: Centaurific, Project: Blender, Lines: 30, Source: text_fun.py


Example 13: processFile

def processFile(fh):
    with gzip.open(fh, 'rb') as f:
        tree = etree.parse(f)
        root = tree.getroot()        
        r = re.compile('^[a-zA-Z]+$')
        s = SnowballStemmer("english")

        paragraphs = root.xpath('DOC[@type="story"]/TEXT/P')        
        
        for p in paragraphs:            
            try:
                sentences = PunktSentenceTokenizer().sentences_from_text(p.text)

                for sentence in sentences:                
                    tokens = TreebankWordTokenizer().tokenize(sentence)

                    #Filter by alphabetic only
                    alphabetic = filter(r.match, tokens)
                    #Filter by stopwords & stem all leftover tokens
                    stop_filtered = [s.stem(w) for w in alphabetic if w.lower() not in stopwords.words('english')]

                    print (" ").join(stop_filtered).upper()
            except:
                continue        


    return True
Author: hweej, Project: text-norm, Lines: 27, Source: gigaprep.py


Example 14: parseOutText

def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """

    stemmer = SnowballStemmer("english")
    
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        
        words = ' '.join([stemmer.stem(word) for word in text_string.split()])
        
    return words
Author: omoju, Project: udacityUd120Lessons, Lines: 29, Source: vectorizer.py


Example 15: __init__

 def __init__(self,df, column,n ): # gets the most frequent words in a document
   
     texto = " ".join(str(x) for x in df[column].values)
     tokens = texto.split()
     tokens=[x.lower() for x in tokens]
     #stopset = set(stopwords.words('english')) # dictionary of stop words
     #tokens = [w for w in tokens if not w in stopset]
     stemmer=SnowballStemmer("english")
     stemm_words=[]
     tokens_clean=[]
     for j in tokens:
       
       sa=re.sub('[^A-Za-z]+', '', j)
       tokens_clean.append(sa)
     #print tokens_clean
     for s in tokens_clean:
       try:
         stem= stemmer.stem(s)
         if s!='':
          stemm_words.append(str(stem)) 
       except:
         pass
     cuenta = len(tokens_clean)
     largo =  Counter(stemm_words).most_common(n)
     topdic = dict(largo)
     asortado = Series(topdic)
     asortadol = asortado.columns = ['a', 'b']
     ordenado = asortado.order(ascending=False)
     ordenadolist= topdic.keys() #+stemm_words
     self.top=ordenadolist
Author: omedranoc, Project: ThesisPreprocessing, Lines: 30, Source: topwords.py


Example 16: clean_text

def clean_text(list_o_text):
    docs = [''.join([char if char not in punctuation else ' ' for char in 
                     comic]) for comic in list_o_text]

    # remove punctuation from string
    docs = [word_tokenize(comic) for comic in docs]
    # make string into list of words

    # 3. Strip out stop words from each tokenized document.
    stop = set(stopwords.words('english'))
    stop.update(punctuation)
    other_words = ['cite', 'cite_note', 'cite_ref', 'class', 'href', 'id', 
                   'redirect', 'ref', 'refer', 'span', 'sup', 'title', 'wiki']
    stop.update(other_words)
    docs = [[word for word in words if word.strip(punctuation) not in stop] 
            for words in docs]
    # remove stop words
    
    # Stemming / Lemmatization
    # 1. Stem using both stemmers and the lemmatizer
    #porter = PorterStemmer()
    snowball = SnowballStemmer('english')
    #wordnet = WordNetLemmatizer()
    #docs_porter = [[porter.stem(word) for word in words] for words in docs]
    docs_snowball = [[snowball.stem(word) for word in words] for words in docs]
    #docs_wordnet = [[wordnet.lemmatize(word) for word in words] for words in docs]
    docs = [' '.join(doc) for doc in docs_snowball]
    # for each document, it becomes a long string
    return docs
Author: eugeneh101, Project: comix-nexus, Lines: 29, Source: tfidf_corpus.py


Example 17: stem_stopword_clean

def stem_stopword_clean( vett_strings ):
    '''
    Takes a vector of strings (students or jobs); each element of the returned list is unique and stemmed.
    Splits elements made up of several words and removes the stop words.
    :param vett_strings: vector of strings
    :return: vector of stemmed words without stop words
    '''

    # import the stemming and stop-word libraries
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import stopwords

    stemmer = SnowballStemmer("italian")

    stop = set(stopwords.words('italian'))

    # logger.error(stemmer.stem("italian"))
    # logger.error(stemmer.stem("a"))
    # logger.error(stemmer.stem("andate tutti a correre"))

    documents=[]

    # logger.error(stop)

    stem_parola=''

    for frasi in vett_strings:
        for parola in frasi.split(" "):
            stem_parola=stemmer.stem(parola)
            if(stem_parola not in stop and stem_parola not in documents):
                documents.append(stem_parola)


    return documents
Author: delpiomat, Project: pysurvey, Lines: 34, Source: recommendation.py


Example 18: cleaned_bag_of_words_dataset

def cleaned_bag_of_words_dataset(data_matrix, stemming=False, stop_words=None, TFIDF=False, ngram_range=(1, 1), max_features=None,
                                 length=False, number_in_tweet=False, words_present=[]):
    if stemming:
        stemmer = SnowballStemmer("english")
        tweets = [" ".join([stemmer.stem(word) for word in word_tokenize(data_point[2].lower().decode("utf8"))]) for data_point in data_matrix]
    else:
        tweets = [data_point[2].lower() for data_point in data_matrix]
        
    if TFIDF:
        vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    else:
        vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features)
    
    dataset = vectorizer.fit_transform(tweets).toarray()
    
    if length:
        lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, lengths), axis=1)
     
    if number_in_tweet:
        numbers = []
        for data_point in data_matrix:
            number_list = list_of_ints_from_string(data_point[2])
            filtered_number_list = [number for number in number_list if abs(number) < 10]
            if len(filtered_number_list) == 0:
                numbers.append([0])
            else:
                numbers.append([np.mean(filtered_number_list)])
        dataset = np.concatenate((dataset, numbers), axis=1)

    for word in words_present:
        word_present = np.array([[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))] for data_point in data_matrix])
        dataset = np.concatenate((dataset, word_present), axis=1)
        
    return dataset
Author: miljanm, Project: Political-Twitter-Sentiment-Mining, Lines: 35, Source: preprocessing.py


Example 19: test_spanish

    def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu'
Author: Journo-App, Project: flask-by-example, Lines: 7, Source: test_stem.py


Example 20: parseOutText

def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        and return a string that contains all the words
        in the email (space-separated)
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """
    f.seek(0)
    all_text = f.read()

    ### split metadata off
    content = all_text.split("X-FileName:")
    words = ""
    st = ""
    if len(content) > 1:
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
        stemmer = SnowballStemmer("english")
        # stem each word and accumulate into st, separated by single spaces
        for word in text_string.split():
            st = st + " " + stemmer.stem(word)

    words = st.lstrip()
    return words
Author: utsri, Project: Machine_Learning_Projects, Lines: 25, Source: parse_out_email_text.py



Note: The nltk.stem.snowball.SnowballStemmer class examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective authors, and the copyright remains with those authors; consult each project's License before redistributing or reusing the code. Do not republish without permission.

