Python nltk.trigrams Function Code Examples


This article collects typical usage examples of the nltk.trigrams function in Python. If you are wondering what nltk.trigrams does, how to call it, or what real uses of it look like, the curated code examples below should help.



The following presents 20 code examples of the trigrams function, ordered by popularity.
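
As a quick orientation before the collected examples, here is a minimal sketch (not taken from any of the projects below) of what nltk.trigrams does: given a sequence of tokens, it yields consecutive 3-tuples. The token list is made up for illustration.

import nltk

tokens = ["the", "quick", "brown", "fox", "jumps"]
# nltk.trigrams yields each run of three consecutive tokens as a tuple
for tri in nltk.trigrams(tokens):
    print(tri)
# ('the', 'quick', 'brown')
# ('quick', 'brown', 'fox')
# ('brown', 'fox', 'jumps')

Note that the result is a generator, so it can be consumed only once; several of the examples below therefore wrap it in list(), set(), Counter(), or nltk.FreqDist() before reuse.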

Example 1: compare_pos

def compare_pos(file_name_1, file_name_2):

    tokens_1 = make_tokens(file_name_1)
    tokens_2 = make_tokens(file_name_2)

    tri_tokens_1  = trigrams(tokens_1)
    tri_tokens_2  = trigrams(tokens_2)

    dist_1 = nltk.FreqDist(tri_tokens_1)
    dist_2 = nltk.FreqDist(tri_tokens_2)

    diff_1 = dist_1 - dist_2
    diff_2 = dist_2 - dist_1

    with open("common_pos_mt.txt", "w") as file:
        for word, freq in diff_1.most_common(20):
            line = str(word) + " " + str(freq) + '\n'
            print(line)
            file.write(line)

    with open("common_pos_hmn.txt", "w") as file:
        for word, freq in diff_2.most_common(20):
            line = str(word) + " " + str(freq) + '\n'
            print(line)
            file.write(line)


    """
Developer: NunoXu | Project: UnbabelChallenge2016 | Lines: 28 | Source: PosComparasion.py


Example 2: train

    def train(self,tweets):
        # 1st step: build the bag-of-words model
        tweet_tokens_list = [tweet_tokens for tweet_tokens,label in tweets]
        tokens = []
        print('Computing the trainset vocabulary of n-grams')
        for tweet_tokens in tweet_tokens_list:
            unigrams = [w.lower() for w,t in tweet_tokens]
            tokens += unigrams
            tokens += ['_'.join(b) for b in bigrams(unigrams)]
            tokens += ['_'.join(t) for t in trigrams(unigrams)]
            tokens += [t1 + '_*_' + t3 for t1,t2,t3 in trigrams(unigrams)]

        # build the bag-of-words list using all the tokens
        self.bag_of_words = set(tokens)

        data = list()
        total_tweets = len(tweets)
        features_list = list()
        for index,(tweet_tokens,label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index+1,total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # Train a SVM classifier
        #data = self.vectorizer.fit_transform([features for features,label in self.train_set_features])
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform([label for tweet_tokens,label in tweets])
        print('Building the model')
        self.classifier.fit(data, target)
Developer: pdsujnow | Project: EmotionTweetClassifier_3412260 | Lines: 29 | Source: MachineLearningClassifier.py


Example 3: main

def main():
	text = open('holmes.txt').read()
	tokens = nltk.wordpunct_tokenize(text)
	charList = []
	for word in tokens:
		for char in word:
			charList.append(char)
	fDistChars = nltk.FreqDist(charList)
	fDistWords = nltk.FreqDist(tokens)
	
	print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars),sorted(fDistChars)))
	print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords),sorted(fDistWords)))
	
	bigramChars = nltk.bigrams(charList)
	trigramChars = nltk.trigrams(charList)

	print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(charList), 
		most_common(bigramChars), most_common(trigramChars)))

	bigramWords = nltk.bigrams(tokens)
	trigramWords = nltk.trigrams(tokens)

	print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(tokens), 
		most_common(bigramWords), most_common(trigramWords)))
	
	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	scoredPMI = finder.score_ngrams(bigram_measures.pmi)
	scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
	
	print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}" .format(scoredPMI[:20],scoredCHI[:20]))
	
	print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
Developer: Martbov | Project: pta-group1 | Lines: 33 | Source: assignment1.py


Example 4: extract_features

    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w,t in tweet_tokens]
        tokens = unigrams
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1,t2,t3 in trigrams(unigrams)]

        tweet_tags =  [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_'+token] = True

        # 2nd set of features: the count for each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in ['N','O','^','S','Z','V','A','R','!','D','P','&','T','X','#','@','~','U','E','$',',','G','L','M','Y']:
            feature_set['num_'+tag] = sum([1 for t in tweet_tags if t == tag])

        # 3rd feature: negation is present?
        negators = set(LexiconClassifier().read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: character ngrams
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token,tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: punctuation ngrams
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token,tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum([1 for token,tag in tweet_tokens if token.isupper() and len(token)>=3])

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set
Developer: pdsujnow | Project: EmotionTweetClassifier_3412260 | Lines: 56 | Source: MachineLearningClassifier.py


Example 5: jacquard_trigram

def jacquard_trigram(query):
    final=[]
    for a in file('enwiktionary.a.list'):
        a=a.rstrip()
        trigram=set(nltk.trigrams(a))
        q_trigram=set(nltk.trigrams(query))
        intersect=q_trigram.intersection(trigram)
        union=q_trigram.union(trigram)
        sim=float(len(intersect))/len(union)
        
        final.append([a,sim])
    final_sorted= sorted(final,key=lambda sim:sim[1], reverse=True)
    print final_sorted[:10]
Developer: jubimishra | Project: Data-Mining | Lines: 13 | Source: jacquard_vs_levenshtein.py


Example 6: main

def main():

    OUT = open("../output.txt", "w")
    OUT.close()
    INP = open("../data/test.hyp1-hyp2-ref", "r")
    inp = INP.read()
    for sent in inp.split("\n")[:-1]:
        h1 = sent.split(" ||| ")[0].split(" ")
        h2 = sent.split(" ||| ")[1].split(" ")
        ref = sent.split(" ||| ")[2].split(" ")
        h1p = process(h1)
        h2p = process(h2)
        refp = process(ref)
        #print(h1c, h2c, refc)
        #h1_match = word_matches(h1, rset)
        #h2_match = word_matches(h2, rset)
        h1c = Counter(h1)
        h2c = Counter(h2)
        refc = Counter(ref)
        h1_bigrams = nltk.bigrams(h1)
        h2_bigrams = nltk.bigrams(h2)
        ref_bigrams = nltk.bigrams(ref)
        h1_trigrams = nltk.trigrams(h1)
        h2_trigrams = nltk.trigrams(h2)
        ref_trigrams = nltk.trigrams(ref)
        #print(h_bigrams, ref_bigrams)
        h1_bigramsc = Counter(h1_bigrams)
        h2_bigramsc = Counter(h2_bigrams)
        ref_bigramsc = Counter(ref_bigrams)
        h1_trigramsc = Counter(h1_trigrams)
        h2_trigramsc = Counter(h2_trigrams)
        ref_trigramsc = Counter(ref_trigrams)
        h1_allc = h1c + h1_bigramsc + h1_trigramsc
        h2_allc = h2c + h2_bigramsc + h2_trigramsc
        ref_allc = refc + ref_bigramsc + ref_trigramsc
        h1_precision = precision(h1_allc, ref_allc)
        h2_precision = precision(h2_allc, ref_allc)
        h1_recall = recall(h1_allc, ref_allc)
        h2_recall = recall(h2_allc, ref_allc)
        h1_meteor = meteor(h1_precision, h1_recall)
        h2_meteor = meteor(h2_precision, h2_recall)
        OUT = open("../output.txt", "a")

        if h1_meteor > h2_meteor:
            OUT.write("-1\n")
        else:
            if h1_meteor == h2_meteor:
                OUT.write("0\n")
            else:
                OUT.write("1\n")
        OUT.close()
Developer: ssitaram | Project: sp2013.11-731 | Lines: 51 | Source: meteor.py


Example 7: calc_trigrams

def calc_trigrams(brown_tags):
    #print brown_tags[0]
    #q_values = {}
    #unigram_c = collections.defaultdict(int)
    bigram_c = collections.defaultdict(int)
    trigram_c = collections.defaultdict(int)

    for stags in brown_tags:
        unigram_tuples = stags
        bigram_tuples =  list(nltk.bigrams(stags))
        trigram_tuples = list(nltk.trigrams(stags))


        #print unigram_tuples
        #for g in unigram_tuples:
            #unigram_c[g] += 1

        for g in bigram_tuples:
            bigram_c[g] += 1

        for g in trigram_tuples:
            trigram_c[g] += 1

    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(brown_tags)
    q_values = {k: math.log(float(v) / bigram_c[k[:2]], 2) for k, v in trigram_c.iteritems()}

    return q_values
Developer: mothaibatacungmua | Project: AI-course | Lines: 27 | Source: solutionsB.py


Example 8: calc_probabilities

def calc_probabilities(training_corpus):
    unigram_p = {}
    bigram_p = {}
    trigram_p = {}
    total_unigram=0
    unigram_freq=Counter()
    bigram_freq=Counter()
    trigram_freq=Counter()
    u_freq=Counter()
    for line in training_corpus:
        line=START_SYMBOL+" "+ line+STOP_SYMBOL
        unigram_tokens=line.split()
        unigram_freq.update(unigram_tokens)
        total_unigram=total_unigram+len(unigram_tokens)
    for sent in training_corpus:
        sent=START_SYMBOL+" "+ START_SYMBOL+" "+sent+STOP_SYMBOL
        unigram_tokens=sent.split()
        u_freq.update(unigram_tokens)
        bigram_tuples=list(nltk.bigrams(unigram_tokens))
        bigram_freq.update(bigram_tuples)
        trigram_tuples=list(nltk.trigrams(unigram_tokens))
        trigram_freq.update(trigram_tuples)

    for key in unigram_freq:
        unigram_p[(key,)]= math.log(unigram_freq[key]/float(total_unigram),2)

    for key in bigram_freq:
        bigram_p[key]= math.log(bigram_freq[key]/float(u_freq[key[0]]),2)
    
    for key in trigram_freq:
        trigram_p[key]=math.log(trigram_freq[key]/float(bigram_freq[key[0],key[1]]),2)

    
    return unigram_p, bigram_p, trigram_p
Developer: jubimishra | Project: Natural-Language-Processing | Lines: 34 | Source: sol.py


Example 9: linearscore

def linearscore(unigrams, bigrams, trigrams, corpus):
    """Linear interpolate the probabilities.

    See http://web.stanford.edu/~jurafsky/slp3/4.pdf paragraph 4.4.3
    """
    scores = []
    # Set lambda equal to all the n-grams so that it sums up to 1.
    lambda_ = 1.0 / 3
    for sentence in corpus:
        interpolated_score = 0
        tokens0 = sentence.strip().split()
        for trigram in nltk.trigrams([START_SYMBOL] + [START_SYMBOL] + tokens0 + [STOP_SYMBOL]):
            try:
                p3 = trigrams[trigram]
            except KeyError:
                p3 = MINUS_INFINITY_SENTENCE_LOG_PROB
            try:
                p2 = bigrams[trigram[1:3]]
            except KeyError:
                p2 = MINUS_INFINITY_SENTENCE_LOG_PROB
            try:
                p1 = unigrams[trigram[2]]
            except KeyError:
                p1 = MINUS_INFINITY_SENTENCE_LOG_PROB
            interpolated_score += math.log(lambda_ * (2 ** p3) + lambda_ * (2 ** p2) + lambda_ * (2 ** p1), 2)
        scores.append(interpolated_score)
    return scores
Developer: mennanov | Project: nlp-coursera | Lines: 27 | Source: solutionsA.py


Example 10: ngramify

    def ngramify(self, word_list, stop):
        # creates an ngram from a word_list based on class settings
        mode = self.mode
        pos = self.inclued_pos
        word = self.include_word
        stopset = set(stopwords.words("english"))
        stopset.remove("not")
        if stop:
            if word and pos:
                selection = [(w.lower(), p) for w, p in word_list if w.lower() not in stopset]
            elif word:
                selection = [w.lower() for w, p in word_list if w.lower() not in stopset]
            elif pos:
                selection = [p for w, p in word_list if w.lower() not in stopset]
        else:
            if word and pos:
                selection = [(w.lower(), p) for w, p in word_list]
            elif word:
                selection = [w.lower() for w, p in word_list]
            elif pos:
                selection = [p for w, p in word_list]

        if mode == "unigrams":
            word_list = selection
        elif mode == "bigrams":
            word_list = nltk.bigrams(selection)
        elif mode == "trigrams":
            word_list = nltk.trigrams(selection)
        return word_list
Developer: sctennis77 | Project: semeval | Lines: 29 | Source: classify.py


Example 11: exercise2

def exercise2(category):
    print
    print "For Category: " + category
    print "Part 1"
    print "Words with the tag 'JJ':"
    words = bn.tagged_words(categories = category)
    wordlist = bn.words(categories = category)
    words_JJ = set(sorted([(word, tag) for (word, tag) in words if tag == 'JJ']))
    print len(words_JJ)
    print
    print "Part 2"
    print "Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:"
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words if tag == 'VBZ' or tag == 'NNPS' or tag == 'NNS']
    print words_VBP_NNPS_NNS[:10]
    print
    sent = ""
    print "Part 3"
    print "The 3 most frequent 3-word prepositional phrases are:"
    words = bn.tagged_words(categories = category)
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words):
        if(t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')):
            sent = sent + w1.lower() + " " + w2.lower() + " " + w3.lower() + "."
    sent_part = sent.split(".")
    fd = nltk.FreqDist(sent_part)
    v = fd.most_common(3)
    print v
    print
    print "Part 4"
    print "Ratio of Masculine to Feminine is:"
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = len([w for w in wordlist if re.search(male_pattern, w.lower())])
    female_pronouns = len([w for w in wordlist if re.search(female_pattern, w.lower())])
    print "Male : Female is -> %d : %d" %(male_pronouns, female_pronouns)
    print
Developer: GirishSrinivas | Project: PythonPrograms | Lines: 35 | Source: Girish_Srinivas_ch5a.py


Example 12: calcSentProb

def calcSentProb(sent, NGramProbDict, n):
    '''
    Look up each tag-ngram (trigrams here) in the target sentence in the
    ngrams log-prob dictionary; if found, add log-prob to total, else use
    the default prob;
    '''
    prob    = 0.0
    count   = 0
    if len(sent)< 2:
        prob = -12
        count = 1
    elif len(sent)<3 or n==2:
        for (w1,t1),(w2,t2) in nltk.bigrams(sent):
            if (t1,t2) in NGramProbDict.keys():
                prob += NGramProbDict[(t1,t2)]
            else:
                prob += tri_default_prob
            count += 1
    elif n==3:
        for (w1,t1),(w2,t2),(w3,t3) in nltk.trigrams(sent):
            if (t1,t2,t3) in NGramProbDict.keys():
                prob += NGramProbDict[(t1,t2,t3)]
            else:
                prob += bi_default_prob
            count += 1
    return float(prob) / count
Developer: divanshugarg | Project: Kaggle-Projects-Stuff | Lines: 26 | Source: grammarAnalNGrams.py


Example 13: ngrams_freq

def ngrams_freq(tokens):
    trigrams  = nltk.trigrams(tokens)
    fdist = nltk.FreqDist(trigrams)
    dd = {}
    for k,v in fdist.items():
        dd[k] = v
    return dd
Developer: tomgond | Project: snipplets | Lines: 7 | Source: utils.py


Example 14: score

def score(ngram_p, n, data):
    scores = []
    if n == 1:
        for sentence in data:
            line_score = 0
            sentence += "STOP "
            unigram_tokens = nltk.word_tokenize(sentence)
            for token in unigram_tokens:
                line_score += ngram_p[(token,)]
            scores.append(line_score)
    elif n == 2:
        for sentence in data:
            line_score = 0
            sentence = "* " + sentence + "STOP "
            bigram_tuples = tuple(nltk.bigrams(nltk.word_tokenize(sentence)))
            for bigram in bigram_tuples:
                line_score += ngram_p[bigram]
            scores.append(line_score)
    elif n == 3:
        for sentence in data:
            line_score = 0
            sentence = "* * " + sentence + "STOP "
            trigra_tuples = tuple(nltk.trigrams(nltk.word_tokenize(sentence)))
            for trigram in trigra_tuples:
                line_score += ngram_p[trigram]
            scores.append(line_score)
    return scores
Developer: sunilitggu | Project: CS565 | Lines: 27 | Source: solutionsA.py


Example 15: _count_words

def _count_words(path):
    print path

    word_count = defaultdict(int)

    with open(path, 'r') as f:
        tokens = nltk.word_tokenize(f.read().decode('utf-8').lower())

    word_counts = nltk.FreqDist(tokens)

    for word, count in word_counts.items():
        word_count[word] = count 
    
    bigrams = nltk.bigrams(tokens)
    bigram_counts = nltk.FreqDist(bigrams)

    for bigram, count in bigram_counts.items():
        word_count['%s %s' % bigram] = count

    trigrams = nltk.trigrams(tokens)
    trigram_counts = nltk.FreqDist(trigrams)

    for trigram, count in trigram_counts.items():
        word_count['%s %s %s' % trigram] = count

    filename = path.split('/')[2]
    count_date = '%s-%s-%s' % (filename.split('-')[0], filename.split('-')[1], filename.split('-')[2])

    with open('data/text/counts/%s.json' % count_date, 'w') as f:
        json.dump({ 'words': word_count }, f)
Developer: nprapps | Project: wh-press-briefings | Lines: 30 | Source: data.py


Example 16: get_trigrams

def get_trigrams(sentence, stopwords, porter):
  words = nltk.word_tokenize(sentence)
  words = [word.lower() for word in words]
  words = [normalize_numeric(word) for word in words]
  words = [normalize_stopword(word, stopwords) for word in words]
  words = [porter.stem(word) for word in words]
  return nltk.trigrams(words)
Developer: 447327642 | Project: nltk-examples | Lines: 7 | Source: eval_model.py


Example 17: ngramify

    def ngramify(self, word_list):
        """
            Transforms word_list into unigrams, bigrams, trigrams

            input:
                list of words
        """

        # creates an ngram from a word_list based on class settings
        mode = self.mode
        pos = self.inclued_pos
        word = self.include_word
        if word and pos:
            selection = [(w.lower(), p) for w, p in word_list]
        elif word:
            selection = [w.lower() for w, p in word_list]
        elif pos:
            selection = [p for w, p in word_list]

        if mode == "unigrams":
            word_list = selection
        elif mode == "bigrams":
            word_list = nltk.bigrams(selection)
        elif mode == "trigrams":
            word_list = nltk.trigrams(selection)
        return word_list
Developer: samuelclark | Project: semeval2013 | Lines: 26 | Source: classify.py


Example 18: demo_findPOSpattern

def demo_findPOSpattern(words_tagged, num=20):
  print "List the most {0} ambiguous words ...".format(num)
  i = 0
  data = nltk.ConditionalFreqDist(words_tagged)
  for word in data.conditions(): 
    if len(data[word]) > 3:
      i += 1
      tags = data[word].keys()
      print word.encode('big5'), "=>", ', '.join(tags)
      if i >= num: break
  while True:
    inp = raw_input("Enter a 3-frame pattern (example:'把 N V', 0 to exit): ")
    if inp == '0': break
    inp = inp.decode('big5')
    P = inp.split(' ')
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(words_tagged):
      W = (w1, w2, w3); T = (t1, t2, t3); 
      flag = 0
      for i in range(len(W)):
        if len(P[i]) == 0: break # if no input pattern then show dialog again
        if ord(P[i]) < 128: # an English tag name 
          if T[i].startswith(P[i]): flag += 1
        elif W[i] == P[i]: flag += 1
      if flag == len(W):
        print ', '.join(W)
Developer: dreampocketit | Project: bocard | Lines: 25 | Source: NLTK_tools.py


Example 19: calc_probabilities

def calc_probabilities(training_corpus):
    unigram_c = collections.defaultdict(int)
    bigram_c = collections.defaultdict(int)
    trigram_c = collections.defaultdict(int)

    for sentence in training_corpus:
        tokens0 = sentence.strip().split()
        tokens1 = tokens0 + [STOP_SYMBOL]
        tokens2 = [START_SYMBOL] + tokens0 + [STOP_SYMBOL]
        tokens3 = [START_SYMBOL] + [START_SYMBOL] + tokens0 + [STOP_SYMBOL]
        # unigrams
        for unigram in tokens1:
            unigram_c[unigram] += 1

        # bigrams
        for bigram in nltk.bigrams(tokens2):
            bigram_c[bigram] += 1

        # trigrams
        for trigram in nltk.trigrams(tokens3):
            trigram_c[trigram] += 1

    unigrams_len = sum(unigram_c.itervalues())
    unigram_p = {k: math.log(float(v) / unigrams_len, 2) for k, v in unigram_c.iteritems()}

    # calc P(W2|W1) = P(W2,W1) / P(W1) = C(W2,W1) / C(W1)
    unigram_c[START_SYMBOL] = len(training_corpus)
    bigram_p = {k: math.log(float(v) / unigram_c[k[0]], 2) for k, v in bigram_c.iteritems()}

    bigram_c[(START_SYMBOL, START_SYMBOL)] = len(training_corpus)
    trigram_p = {k: math.log(float(v) / bigram_c[k[:2]], 2) for k, v in trigram_c.iteritems()}
    return unigram_p, bigram_p, trigram_p
Developer: mennanov | Project: nlp-coursera | Lines: 32 | Source: solutionsA.py


Example 20: get_classification

	def get_classification(self, text):
		text = ut.clean(text)
	
		uni = nltk.tokenize.word_tokenize(text)
		
		bi = nltk.bigrams (uni)
		tri = nltk.trigrams (uni)
		
		temp_lambda = self.lambda_pi
		
		# Map to store answer to its divergence pairs
		list_of_ans = dict()
		
		for (ques, ans) in self.training_set:
			
			fin_val = 0.0
		
			for t in uni:
				fin_val += temp_lambda[5] * (float(self.unigram_tot_dict.get(t,0))/self.len)
				fin_val += temp_lambda[4] * (float(self.unigram_dict.get((ques,t),0))/len(ques))
			
			for t in bi:
				fin_val += temp_lambda[3] * (float(self.bigram_tot_dict.get(t,0))/self.unigram_tot_dict.get(t[:1],1))
				fin_val += temp_lambda[2] * (float(self.bigram_dict.get((ques,t),0))/self.unigram_dict.get((ques,t[:1]),1)) 
			
			for t in tri:
				fin_val += temp_lambda[1] * (float(self.trigram_tot_dict.get(t,0))/self.bigram_tot_dict.get(t[:2],1))
				fin_val += temp_lambda[0] * (float(self.trigram_dict.get((ques,t),0))/self.bigram_dict.get((ques,t[:2]),1))		
			
			list_of_ans[self.training_orig.get(ans, ans)] = fin_val
		
		# Return Weighted list of responses
		return list_of_ans
Developer: cloudbearings | Project: QuestCon | Lines: 33 | Source: trainer_main.py



Note: The nltk.trigrams examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.

