
Python nltk.FreqDist Class Code Examples


This article collects typical usage examples of the nltk.FreqDist class in Python. If you have been wondering what FreqDist is for, how to use it, or what real FreqDist code looks like, the selected class examples below should help.



The following presents 20 code examples of the FreqDist class, ordered by popularity by default.
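Before the collected examples, here is a minimal hand-written sketch of the core FreqDist API. It is illustrative only and not drawn from any of the projects below; the toy token list and the printed values are assumptions of this sketch.

from nltk import FreqDist

# Count token frequencies in a small hand-made token list.
tokens = ["the", "cat", "sat", "on", "the", "mat", "the", "end"]
fdist = FreqDist(tokens)

print(fdist["the"])          # 3 -- count of a single token
print(fdist.N())             # 8 -- total number of samples
print(fdist.most_common(2))  # [('the', 3), ('cat', 1)] -- top tokens by count

FreqDist subclasses collections.Counter, so indexing, N(), and most_common() are the operations you will see throughout the examples below.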

Example 1: top_words_from_corpus

 def top_words_from_corpus(self, num_words, test_name):
     corpus_tokens = []
     for i in self.corpus_vars["corpus_member_ids"]:
         title = 'document_' + str(i)
         doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
         corpus_tokens += doc_tokens
     top_words = []
     fdist_corpus = FreqDist(corpus_tokens)
     # (word, count) pairs in descending frequency; NLTK 2's items() was
     # already sorted this way, NLTK 3 needs most_common()
     fdist_list = fdist_corpus.most_common()
     if test_name == "Function Word PCA":
         function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
         for i in fdist_list:
             top_words.append(i[0])
             if len(top_words) == num_words:
                 # POS-tag the candidates, drop non-function words, and
                 # keep filling until num_words function words remain
                 tagged_top = nltk.pos_tag(top_words)
                 for j, k in tagged_top:
                     if k not in function_pos:
                         top_words.remove(j)
                 if len(top_words) == num_words:
                     break
     elif test_name == "Burrows's Delta":
         for i in fdist_list:
             top_words.append(i[0])
             if len(top_words) == num_words:
                 break
     return top_words
Developer: mjlavin80 | Project: py_style | Lines: 26 | Source: py_styleModel.py


Example 2: get_hosts

def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.'''
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2 and pair[0] != ' ' and pair[1] != ' ':
                    pairs.append((pair[0].lower().replace('\'', '\"').strip(' '),
                                  pair[1].lower().replace('\'', '\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    top_pairs = pairs_freq.most_common(10)
    # If the most frequent pair starts with a single-word name, fall back
    # to the second most frequent pair.
    if len(top_pairs[0][0][0].split(' ')) < 2:
        hosts.append(top_pairs[1][0][0])
        hosts.append(top_pairs[1][0][1])
    else:
        hosts.append(top_pairs[0][0][0])
        hosts.append(top_pairs[0][0][1])
    return hosts
Developer: GregoryElliott | Project: TGMA_NLP_Project | Lines: 25 | Source: gg_api.py


Example 3: make_cutOff

def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in a set of tweets; bottomCutOff
    and topCutOff are integers
    OUTPUT:
    newVocab = a 1-d list of all tokens we want to keep
    thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    # (token, count) pairs in descending frequency; NLTK 2's items() was
    # already sorted this way, NLTK 3 needs most_common()
    items = fd.most_common()
    newVocab = []
    thrownOut = []

    for item in items[:topCutOff]:
        # throw out the topCutOff most common words
        thrownOut.append(item)

    for item in items[topCutOff:]:
        if item[1] > bottomCutOff:
            # keep words above the bottom cutoff
            newVocab.append(item[0])
        else:
            # throw out uncommon words
            thrownOut.append(item)

    print('Cutoffs made...')
    return newVocab, thrownOut
Developer: JRMeyer | Project: twitter | Lines: 27 | Source: twitter_lda.py


Example 4: main

def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    # pickle files must be opened in binary mode
    words = load(open("words.pickle", "rb"))
    tweets = load(open("tweets.pickle", "rb"))
    freq_dist = FreqDist(words)
    print(tweets)
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Natural Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    # most_common() replaces the frequency-sorted keys() of NLTK 2
    print([word for word, _ in freq_dist.most_common(50)])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print([word for word, _ in freq_dist.most_common()[-50:]])
    print("===")
Developer: 0day1day | Project: OSINT | Lines: 31 | Source: nltk_tweet_analysis.py


Example 5: get_word_features

def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    # NLTK 2's keys() was sorted by descending frequency; most_common()
    # preserves that intent on NLTK 3, where keys() is insertion-ordered
    word_features = [word for word, _ in wordlist.most_common()]
    return word_features
Developer: toshi09 | Project: UserProfilingInSocialMedia | Lines: 7 | Source: naive_bayes_nltk.py


Example 6: __init__

  def __init__(self, num_topics, alpha_topic=1.0, alpha_word=1.0,
               max_tables=50000, sanity_check=False, initialize=False,
               report_filename="topic_history.txt"):

    self.max_tables = max_tables
    self._alphabet = FreqDist()
    # store all words seen in a list so they are associated with a unique ID
    self.initialize_index()

    self._words = FreqDist()

    self.alpha_topic = alpha_topic
    self.alpha_word = alpha_word

    self._num_updates = 0
    self._report = None
    if report_filename:
        self._report = open(report_filename, 'w')

    self.num_topics = num_topics
    # one frequency distribution per topic; range replaces Python 2's xrange
    self._topics = [FreqDist() for x in range(num_topics)]

    # the sanity_check flag is for testing only
    if initialize and sanity_check:
        self.deterministic_seed()
    elif initialize:
        self.initialize_topics()
Developer: Mondego | Project: pyreco | Lines: 28 | Source: allPythonContent.py


Example 7: __init__

class Index:
    """
    The Index class stores an index for a document.
    """
    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist is None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                # NLTK 3 removed FreqDist.inc(); incrementing the count
                # directly is the modern equivalent
                self._freq_dist[term] += 1

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist is None:
            # rebuild the index for the stored document (the original
            # called self.index() without its required argument)
            self.index(self._document)
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
Developer: jgerrish | Project: nltk_ext | Lines: 34 | Source: index.py


Example 8: create_word_freq

def create_word_freq(db):
    db = getattr(db, "Posts")
    #client.command("CREATE CLASS concepted EXTENDS E")
    client.command("DELETE EDGE concepted")
    #client.command('create property frequency.freq string')
    #client.command("DELETE VERTEX frequency")
    data = db.find().batch_size(50)
    concept = client.command("SELECT name FROM concept")
    c = [c.name for c in concept]
    for d in data:
        # posts without a body have nothing to tokenize
        if 'Body' not in d:
            continue
        display = cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist = FreqDist(tokens)
        for k in fdist.most_common():
            if k[0].lower() in c:
                try:
                    client.command("CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(k[0].lower(), d['_id'], k[1]))
                except Exception:
                    # skip posts whose edge creation fails
                    continue
Developer: Wehrlem | Project: flaskWebIn | Lines: 25 | Source: stackToOrient.py


Example 9: process

def process(f, return_tokens=True, return_freqdist=True):
    """
    Function to process deals data.
    Splits text into sentences. FreqDist is incremented from tokenization.
    Using PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals are also about domain names. Not intending to split those up.

    :rtype : FreqDist, list() of str
    :param f: Input file with a deal per line
    """
    fd = FreqDist()
    tokens = []
    # build the stopword and punctuation sets once, not once per word
    ignored = set(stopwords.words('english')) | set(string.punctuation)
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    for line in sentences:
        t = []
        # note: PunktWordTokenizer was removed in NLTK 3; on newer versions
        # substitute e.g. nltk.tokenize.WordPunctTokenizer()
        for word in PunktWordTokenizer().tokenize(line.lower()):
            if word not in ignored:
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    # the original used fd.inc(word); NLTK 3 increments directly
                    fd[word] += 1
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
Developer: ypandit | Project: exercises | Lines: 25 | Source: task1.py


Example 10: process_tweets

def process_tweets(hashtag, addl_stops=[]):
    count = 0
    good_count = 0
    words_to_plot = []
    # Iterate through all chunked files with the relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname, 'r') as data_file:
                data = data_file.read()
                # Parse the raw string since the json.load() approach wasn't working
                data = data.split("\n\x00,")
            for tweet in data:
                count += 1

                # Tweets have a well-defined structure, so we can parse them
                # manually (even though the JSON approach would be cleaner)
                text = tweet[tweet.find("text\":") + 7:tweet.find(",\"source\"") - 1]

                # Skip tweets that contain Unicode escapes; '\\u' is the
                # two-character sequence backslash-u (the original's bare
                # '\u' is a syntax error in Python 3)
                if text.find('\\u') >= 0:
                    continue
                else:
                    good_count += 1
                    # Tokenize and count word frequency, ignoring case
                    words = word_tokenize(text)
                    clean_words = [w.lower() for w in words if not w.lower() in set(stops + addl_stops)]
                    words_to_plot = words_to_plot + clean_words

    # Create a frequency histogram of the 50 most common words and print a summary
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print("for " + hashtag + ' we collected ' + str(count) + ' tweets out of which ' + str(good_count) + " will be analyzed")
    return words_to_plot
Developer: lanorzi | Project: MIDS-W205_A2-1 | Lines: 33 | Source: create_histograms.py


Example 11: featureset

def featureset(sample):
  comment, label = sample
  features = {}
#  tags = [[t for w, t in statement] for statement in comment]
  # extract the word from each (word, tag) pair; Python 2's tuple-unpacking
  # lambdas are a syntax error in Python 3, so use comprehensions instead
  words = [[w for w, t in statement] for statement in comment]
  words = sum(words, [])
#  tags = sum(tags, [])
  size_ = sum([len(word) for word in words])
  features['stmt_len'] = len(words) / float(len(comment))
  features['word_len'] = size_ / float(len(words))
  features['size'] = size_
#  tags_dist = FreqDist(sum(tags, []))
#  for tag in TAGS:
#    features[tag] = tags_dist.get(tag, 0)
  dist = FreqDist([word.lower() for word in words])
#  num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
#  features['prob_stop_words'] = num_stop_words/len(words)
  for word in EN_STOPWORDS:
    features[word] = dist.get(word, 0) / float(len(words))
  features['alwayson'] = 1.0
  for language in LANGUAGES:
    for i in range(1, n + 1):
      word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(GRAMS[language], comment, i)
      features['w_sim_%d_%s' % (i, language)] = word_sim
      features['t_sim_%d_%s' % (i, language)] = tag_sim
      features['c_sim_%d_%s' % (i, language)] = char_sim
#     features['s_sim_%d_%s' % (i, language)] = w_s_sim
  return (features, label)
Developer: aboSamoor | Project: NLP | Lines: 28 | Source: rami_learning.py


Example 12: posAnalysis

def posAnalysis(collection):

    reviews = collection.find(timeout=False)

    __reportProgress.counter = 0

    skip = 1

    for rev in reviews:
        if skip % 200 == 0:
            print('skip' + str(skip))
        __reportProgress()
        # skip reviews that were already tagged on a previous run
        # ('in' replaces Python 2's dict.has_key())
        if 'tags' in rev:
            skip += 1
            if 'NN' in rev['tags']:
                continue

        sents = sent_tokenize(rev['text'])
        tokens = [word for sent in sents for word in word_tokenize(sent)]
        pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
        # distribution of POS tags over the review
        tag_fd = FreqDist(tag for (word, tag) in pos)
        tags = dict()
        for (key, value) in tag_fd.items():
            k = key.replace('$', 'S')
            # strip punctuation from the tag name; str.maketrans replaces
            # Python 2's string.maketrans/translate combination
            out = key.translate(str.maketrans('', '', string.punctuation))
            if len(out) > 0:
                tags[k] = value
        collection.update({'_id': rev['_id']}, {"$set": {"tags": tags}})
Developer: ecsark | Project: Yelp-Recruiting | Lines: 28 | Source: trueRating.py


Example 13: getTopNFreqWords

def getTopNFreqWords(textArr,N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords=[]
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
Developer: akhilarora | Project: intelliad | Lines: 7 | Source: NaiveBayesClassifierBulk-USER.py


Example 14: category_by_movie

def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # take the 2000 most frequent words as features; NLTK 2's keys() was
    # frequency-sorted, NLTK 3 needs most_common()
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print(document_features(mr.words('pos/cv957_8737.txt')))
    #print(documents[0])

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print(classify.accuracy(classifier, train_set))
Developer: brenden17 | Project: infinity | Lines: 30 | Source: category_nltk.py


Example 15: cleaner

def cleaner(filename):
    textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename), 'r')
    text = []
    all_dates = []
    complete_text = []
    words_list = []
    nodes = []
    for line in textfile:
        # split only on the first '-' so dashes inside messages survive
        date_time, chat = line.split('-', 1)
        date, time = date_time.split(',')
        loc = chat.find(':')

        #if len(chat.split(':')) == 3:
        #    print(chat)
        user, text = chat[:loc], chat[loc + 2:]
        text = text.replace("\n", '')
        words = text.split(' ')
        for i in words:
            words_list.append(i)
        complete_text.append(text)
        nodes.append(user)
        all_dates.append(date)

    #print(set(nodes))
    #print(set(all_dates))
    fdist = FreqDist(words_list)
    f1 = fdist.most_common(100)
    create_csv('wordcloud.csv', f1)
    textfile.close()
Developer: sehgalvibhor | Project: Whatsapp-ening | Lines: 29 | Source: flaskr.py


Example 16: bag_of_words

def bag_of_words(data, label_codebook, feature_codebook, theta):
    """Build binary bag-of-words vectors, skipping the theta most frequent words."""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern=r"\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)

    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist(all_words)
    # drop the theta most frequent entries; NLTK 2's keys() was
    # frequency-sorted, NLTK 3 needs most_common()
    word_feature = set(w for w, _ in fdict.most_common()[theta:])
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)

    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern=r"\w+"))

            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Developer: Juicechuan | Project: workspace | Lines: 33 | Source: naive_bayes.py


Example 17: palavrasChaves

    def palavrasChaves(self):
        # NLTK function that returns the stopwords for English
        stopE = stopwords.words('english')

        # NLTK function that returns the stopwords for Portuguese
        stop = stopwords.words('portuguese')

        stopS = stopwords.words('spanish')

        palavrasChaves = []
        textoArtigo = []

        # strip punctuation from the title and split it into words
        for i in self.titulo.lower().replace(',', '').replace('.', '').replace('-', '').replace('(', '').replace(')', '').split():
            # drop Portuguese stopwords from the article being presented
            if i not in stop:
                # drop English stopwords
                if i not in stopE:
                    # drop Spanish stopwords
                    if i not in stopS:
                        # ignore words shorter than 3 characters, to handle
                        # words such as the verb "é"
                        if len(i) > 2:
                            textoArtigo.append(i)

        # frequency of each word in the body of the article
        freq = FreqDist(textoArtigo)

        # take the four most frequent words; most_common(4) replaces
        # the frequency-sorted items() slice of NLTK 2
        items = freq.most_common(4)

        # collect the most frequent words as the keywords
        for i in range(0, len(items)):
            palavrasChaves.append(items[i][0])

        return palavrasChaves
Developer: dienerpiske | Project: QSabe | Lines: 34 | Source: models.py


Example 18: transmit_vocabulary

def transmit_vocabulary(t_token, t_lang):
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian',
                 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))
    i_f = codecs.open('csv/'+t_token+'.csv', 'r', 'utf-8')
    lines = i_f.readlines()
    all_tweets = []
    corpus_size = 0
    for line in lines:
        row = line.split('\t')
        words = word_tokenize(row[1])
        all_tweets.extend([w.lower() for w in words])
        corpus_size += 1
    freq_distribution = FreqDist(all_tweets)
    cats_vocabulary_elements = []
    for word, frequency in freq_distribution.most_common(1000):
        if word not in voc_stopwords:
            cats_vocabulary_elements.append('["' + word + '", ' + str(frequency) + ']')
    cats_vocabulary = '['+','.join(cats_vocabulary_elements)+']'
    print(cats_vocabulary)
    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
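    # note: urllib2 is Python 2 only; on Python 3, urllib.request provides
    # the equivalent Request/urlopen API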
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token '+t_token)
    os.remove('csv/' + t_token + '.csv')
Developer: CATS-Project | Project: CATS-TextMiningServices | Lines: 30 | Source: run.py


Example 19: find_names

 def find_names(self):
     """creates a frequency distribution of the
     most common names in the texts"""
     names_list = LIST_OF_NAMES
     name_tokens = [w for w in self.tokens if w in names_list]
     fd = FreqDist(name_tokens)
     return fd.most_common(50)
Developer: wludh | Project: frenchnewspapers | Lines: 7 | Source: main.py


Example 20: analyzeTitles

def analyzeTitles():
    fulltitles = []
    titles = []
    # text mode for csv on Python 3 (the original's 'rb' is the Python 2 idiom)
    with open('../top100clean.csv', 'r') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the', 'a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])

    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print("\nw.i.t.\tfreq")
    print("--------------------")
    # items() replaces Python 2's iteritems()
    for numwords, times in wit_fd.items():
        print(str(numwords) + "\t" + str(times))
    print("\n")

    print("\nword\t\tfreq")
    print("--------------------")
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print(str(k) + "\t\t" + str(v))
Developer: nelsonam | Project: booklytics | Lines: 29 | Source: analyze_titles.py



Note: the nltk.FreqDist class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and distribution and use should follow the corresponding project licenses. Please do not reproduce without permission.

