
Python tokenize.RegexpTokenizer Class Code Examples


This article collects typical usage examples of Python's nltk.tokenize.RegexpTokenizer class. If you have been wondering what exactly the RegexpTokenizer class does, how to use it, or what real code that uses it looks like, the hand-picked examples below should help.



The following shows 20 code examples of the RegexpTokenizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
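
Before turning to the project excerpts, here is a minimal, self-contained sketch of the basic API (it is not taken from any of the projects below; the sample sentence and both patterns are purely illustrative). The regular expression passed to RegexpTokenizer either describes the tokens themselves or, with gaps=True, the separators between them:

from nltk.tokenize import RegexpTokenizer

text = "Good muffins cost $3.88 in New York."

# Pattern matches the tokens: runs of word characters, so punctuation is dropped.
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize(text))
# ['Good', 'muffins', 'cost', '3', '88', 'in', 'New', 'York']

# With gaps=True the pattern matches the separators instead, so the text is
# split on whitespace and punctuation stays attached to the words.
gap_tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(gap_tokenizer.tokenize(text))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']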

Example 1: preprocess

def preprocess(TWEETS, typeTweet):
    wordlist = []
    tokenizer = RegexpTokenizer(r'#?\w+') 
    #normalize text -- TOKENIZE USING REGEX TOKENIZER
    cnt = 0
    for item in TWEETS:
        text = TWEETS[cnt]
        tweet = ''.join(text)
        tweet = tweet.lower().strip('\n')
        
        tweet = re.sub(r'[0-9]+', "" , tweet)
        tweet = re.sub(r'@[^\s]+', "" , tweet)
        tweet = re.sub(r'#\w+primary', "" , tweet)                    
        wordlist.extend(tokenizer.tokenize(tweet))
        cnt += 1

    #remove stopwords
    stop = stopwords.words('english') + ['rt', 'via', 'u', 'r', 'b', '2', 'http', 
                                        'https', 'co', 'live', 'hall', 'town', 'watch', 
                                        'tune', 'time', 'tonight', 'today', 'campaign', 
                                        'debate', 'wants', 'without', 'dont', 
                                        '#hillaryclinton', '#berniesanders', '#donaldtrump', 
                                        '#tedcruz', "#johnkasich", '#politics']
    filtered = [term for term in wordlist if term not in stop] 
    filtered_final = [term for term in filtered if len(term)>3] 
    print 'Preprocessed %s tweets' % (typeTweet)
    return filtered_final
Developer: martinezmonica123, Project: Twitter-Sentiment-Analysis, Lines: 27, Source file: text_analysis_tweets.py


Example 2: lda

def lda(data):
	data = get_only_text(data)
	only_tweet = data
	length = len(only_tweet)
	length = min(20,length)
	for i in xrange(0,length):
		print i
		print only_tweet[i]
	return
	
	tokenizer = RegexpTokenizer(r'\w+')
	en_stop = get_stop_words('en')
	p_stemmer = PorterStemmer()

	length = len(only_tweet)
	length = min(20,length)
	total_texts = []
	for i in xrange(0,length):
		print only_tweet[i]
		print 
		to_lower = only_tweet[i].lower()
		tokens = tokenizer.tokenize(to_lower)
		stopped_tokens = [k for k in tokens if not k in en_stop]
		texts = [p_stemmer.stem(k) for k in stopped_tokens]
		total_texts.append(texts)

	dictionary = corpora.Dictionary(total_texts)
	corpus = [dictionary.doc2bow(text) for text in total_texts]

	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
	result =  ldamodel.print_topics(num_topics=2, num_words=1)
	for i in result:
		print i
Developer: ChilupuriAnilReddy, Project: SMAI_Major_Project, Lines: 33, Source file: Analysing_Data.py


Example 3: textToWordList

def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword =[
        'дом',
        'город',
        "дорог",
        "час",
        "ноч",
        "слов",
        "утр",
        "стран",
        "пут",
        "путешеств",
        "мест",
        'нов',
        "друз",
        "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
Developer: Omrigan, Project: travelrec, Lines: 25, Source file: views.py


Example 4: Tokenize

def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Developer: prathmeshgat, Project: SuicidalPersonDetection, Lines: 27, Source file: AudioToText.py


Example 5: tokenize

    def tokenize(self, doc):
        '''
        use NLTK RegexpTokenizer
        '''

        tokenizer = RegexpTokenizer("\w{3,}")
        return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
Developer: Hongtian22, Project: Movier, Lines: 7, Source file: movier.py


Example 6: text_process

def text_process(text):
    '''
    Takes in a string of text, then performs the following
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns a list of the cleaned text
    '''
    if(pd.isnull(text)):
        return []
    
    # Tokenize 
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # Stemming
    porterStemmer = PorterStemmer()
    
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
        
    except:
        pass
    
    return " ".join(text_processed)
Developer: shankarchari, Project: data_science, Lines: 30, Source file: process_data.py


Example 7: trainMarkovChain

    def trainMarkovChain(self, n = 1):

        self.ngram_degree = n
      
        self.markov_model = defaultdict(lambda : defaultdict(int))

        sentences = self.corpus_sentences
        if sentences is None:
            sentences = self.sentenceTokenizeCorpus()

        print("Training markov model on corpus.")

        word_tokenizer = RegexpTokenizer(r"\w+")

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            last_word_list = ["#"] * n

            for word in words:
                last_token = " ".join(last_word_list)
                
                self.markov_model[last_token][word] += 1
                
                last_word_list.append(word)
                last_word_list = last_word_list[1:]

            last_token = " ".join(last_word_list)
            self.markov_model[last_token]["#"] += 1
Developer: iangonzalez, Project: NaNoGenMo, Lines: 28, Source file: NaNoGenMo.py


Example 8: __init__

	def __init__(self, oldid, newid, data, general):
		self.newid=newid
		self.oldid=oldid
		self.data=data
		self.tfidfatt=[]
		self.tfidfval=[]
		self.freatt=[]
		self.freval=[]
		self.text=''
		self.ntlk=[]
		self.idfvalue=[]
		self.general=general

		tokenizer = RegexpTokenizer(r'\w+')
		#stemmer = SnowballStemmer("english")
		stemmer = PorterStemmer()

		stop = stopwords.words('english')
		for r in tokenizer.tokenize(data):
			a=0
			if r not in stop:
				if not any(i.isdigit() for i in r):
					r = stemmer.stem(r)
					if r not in self.ntlk:
						self.ntlk.append(r)
						self.text=self.text+' '+r
Developer: t1mch0w, Project: CSE5243, Lines: 26, Source file: reuter.py


Example 9: mean_stdDeviation

	def mean_stdDeviation(self,query,stopWordInstruction):
		list_count_postTitles = []
		list_postTitles = self.data[:][query].tolist()
		tokenizer = RegexpTokenizer(r'\w+')

		stopwords_mine = []
		#a.encode('ascii','ignore')
		stopwords_mine+= (word.encode('ascii','ignore') for word in stopwords.words('english'))
		tokenized_list = []
		new_list_tokenized = []
		for item in list_postTitles:
			tokenized_list.append(tokenizer.tokenize(item))
		
		if stopWordInstruction==True:
			for item in tokenized_list:
				temp = []
				temp += (word for word in item if word.lower() not in stopwords_mine)
				#print temp
				#raw_input()
				new_list_tokenized.append(temp)
		else:
			new_list_tokenized=copy.deepcopy(tokenized_list)
		


		for x in new_list_tokenized:
			list_count_postTitles.append(len(x))
		#print list_count_postTitles
		npArray = np.asarray(list_count_postTitles)
		print npArray.mean()
		print npArray.std()
		return [npArray.mean(),npArray.std(),list_postTitles,list_count_postTitles]
Developer: akshay0193, Project: SAH, Lines: 32, Source file: assignment1.py


Example 10: issue_analysis

def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List=[]
    for i in range(0,50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize Reg
    en_stop = get_stop_words('en')         # create English stop words list
    p_stemmer = PorterStemmer()            # Create p_stemmer of class PorterStemmer
    texts = []                             # list for tokenized documents in loop
    text_view = ''
                                                                
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
       
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8,6))
    fig1 = fig.add_subplot(1,1,1)
    fig1.set_title("Top issued words", fontdict={'fontsize':25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    
    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary)
    LDAText =  ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
       
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
Developer: choi-junhwan, Project: ConsumerComplaintsDataProject, Lines: 60, Source file: Complaints_TextAnalysis.py


Example 11: stripped_words

 def stripped_words(self, original_sentence):
     _sentence = filter(self.printable_char_filter, original_sentence)
     _sentence = _sentence.replace(u'\u2013', ' ')
     _sentence = _sentence.replace(u'\u2014', ' ')
     tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
     tokens = tokenizer.tokenize(_sentence)
     return [word.lower() for word in tokens if word.lower() not in stop_words]
Developer: bdenglish, Project: article_summarizer, Lines: 7, Source file: article_summarizer.py


Example 12: relevance_features

def relevance_features(doc):
	print "relfeatures"
	print doc[:10]
	features={}
	#print doc
	#Test 1 : Has synonyms of  NIT Warangal
	features['contains synonym']='false'
	for word in synonyms:
		if word in doc:
			features['contains synonym']='true'
			break

	#Test 2 : Has a person name that appears in Almabase's DB
	count=0
	names=ner.get_names(data)
	count=ner.query_db(names)
	print 'count is {}'.format(count)

	# if count==0:
	# 	features['hasAlumnus']='none'
	# elif count<=3:
	# 	features['hasAlumnus']='medium'
	# elif count>3:
	# 	features['hasAlumnus']='high'
	# print count

	#Test 3: Bag of words approach
	tokenizer = RegexpTokenizer(r'\w+')
	document_words=tokenizer.tokenize(doc)
	for word in word_features:
		if word.lower() in document_words:
			print "{} is present".format(word)
		features['contains({})'.format(word.lower())] = (word in document_words)
	return features
Developer: Pr1yanka, Project: Smart-News-Scraper-1, Lines: 34, Source file: classifier.py


Example 13: preprocess_wikidata

def preprocess_wikidata(raw):
 # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i)>2]

    return (tokens, text)
Developer: DailyActie, Project: AI_APP_CV-TextTopicNet, Lines: 31, Source file: preprocess_text.py


Example 14: get_product_vocab

def get_product_vocab(dict_queries):
    tok = RegexpTokenizer(r'\w+')
    vocab = {}

    for query,v in dict_queries.items():
        words = defaultdict(int)

        for prod in v:
            w_prod = tok.tokenize(prod[1])
            for w in w_prod:
                #wt = stem(wt)
                if not re.match(r'\d+$', w) and \
                    len(w) > 1 and \
                    w not in stop_words: 
                    words[w] += 1

        vocab[query] = words.keys()
        #vocab[query] = [k for (k, v) in words.iteritems() if v > 1]

        """
        print "Query: " + query
        sorted_w = sorted(words.items(), key=lambda x:x[1], reverse=True)
        print sorted_w
        """
    
    return vocab
Developer: gbakie, Project: kaggle-cf-search, Lines: 26, Source file: multi_svm_model.py


Example 15: write_summary

def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
            f.flush()
Developer: kedz, Project: cuttsum, Lines: 7, Source file: build_model_summaries.py


Example 16: count_ngrams

def count_ngrams(sessions,length):
    data = sessions
    data = data.replace(',',' ')
    

    tokenizer = RegexpTokenizer("[0-9]+")
    #include only number (pageIDs) for tokens
    token = tokenizer.tokenize(data)
    from nltk.util import ngrams
    #print list(ngrams(token, 2))

    generated_ngrams = list(ngrams(token,length))
    #print generated_ngrams
    try:
        ngrams = ' '.join(generated_ngrams[0])
    except IndexError:
        global non_list 
        non_list += 1
        #print 'Failed generated ngrams as there is no minimum '    
   # print ngrams
 
    for ngram in generated_ngrams:
        if not ngrams_statistics.has_key(ngram):
            ngrams_statistics.update({ngram:1})
        else:
            ngram_occurrences = ngrams_statistics[ngram]
            ngrams_statistics.update({ngram:ngram_occurrences+1})      
Developer: Madhuka, Project: episode-mining, Lines: 27, Source file: list_page_sequences.py


Example 17: preprocess

def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    #filtered_words = filter(lambda token: token not in stopwords.words('english'))
    return " ".join(filtered_words)
Developer: Yelrose, Project: liblinear_20NewsGroup, Lines: 7, Source file: pipline.py


Example 18: run

    def run(self):
        """
        How do I run this Task?
        Luigi will call this method if the Task needs to be run.
        """
        # remove stop words and punctuation
        stop = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
        wordnet = WordNetLemmatizer()

        docs = []

        #ipdb.set_trace()

        for f in self.input(): # The input() method is a wrapper around requires() that returns Target objects
            lines = 0
            words = []

            for line in f.open('r'):
                if lines == 0:
                    label = line
                    lines +=1
                else:
                    words.extend(tokenizer.tokenize(line))
                    lines +=1

            # lemmatize, drop stop words, and store the cleaned tokens
            words_filtered = [wordnet.lemmatize(w) for w in words if w not in stop]
            docs.append((label, '\t'.join(words_filtered)))

        out = self.output().open('w')
        for label, tokens in docs:
            out.write("%s,%s\n" % (label.strip(), tokens.strip()))
        out.close()
Developer: DATAQC, Project: data-engineering-101, Lines: 33, Source file: ml-pipeline.py


Example 19: run

 def run(self, data):
     results = []
     tokenizer = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
     for corpus in data:
         corpus.contents = " ".join(tokenizer.tokenize(corpus.contents))
         results.append(corpus)
     return results
Developer: kmp3325, Project: linguine-python, Lines: 7, Source file: remove_punct.py


Example 20: lemmatizeall

def lemmatizeall(word_list):
  """ Lemmatizes the word_list passing through each type of word

  Input: 
    word_list - list of words to be cleaned
    
    pos options: ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
  """
  word_types = "v", "a", "n", "s", "r"
  #print(word_types)
  #ipdb.set_trace() 
  wnl = nltk.WordNetLemmatizer()
  
  tokenizer = RegexpTokenizer(r'\w+')
  for x in range(0, len(word_list)):   
      
      word_tokens = tokenizer.tokenize(str(word_list[x]))
      word_tokens_lem = word_tokens
      for i in range(0, len(word_types)):
      
          pos = word_types[i]      
          word_tokens_lem = [wnl.lemmatize(w, pos=pos) for w in word_tokens_lem]
          
      sep = " "
      word_list[x] = sep.join(word_tokens_lem)
   
          #print(i)
  return word_list #[wnl.lemmatize(w, pos=pos) for w in word_list]  
Developer: AnnaMag, Project: ncvo-s2ds-2015, Lines: 28, Source file: text_processing.py



Note: the nltk.tokenize.RegexpTokenizer class examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets are excerpted from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.

