
Python nltk.bigrams Function Code Examples


This article collects typical usage examples of the nltk.bigrams function in Python, drawn from real open-source projects. If you are unsure how nltk.bigrams is called in practice, the curated examples below should help.

The following 20 code examples of the bigrams function are shown, ordered by popularity.
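Before the examples, a minimal sketch of what nltk.bigrams does (assuming NLTK 3, where it returns a generator of adjacent pairs):

    import nltk

    tokens = "the quick brown fox jumps".split()
    # nltk.bigrams yields each adjacent pair; wrap in list() to materialize
    print(list(nltk.bigrams(tokens)))
    # [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps')]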

Example 1: freq_dst

    def freq_dst(self,posCorpus,negCorpus):
         
        #Creates frequency distribution for words in corpus
        posFreqDist = FreqDist()
        for word in posCorpus.words():
            posFreqDist.inc(word)

        negFreqDist = FreqDist()
        for word in negCorpus.words():
            negFreqDist.inc(word)
 
        #Frequency Distributions with Laplace Smoothing 
        global posLapFreq
        posLapFreq = nltk.probability.LaplaceProbDist(posFreqDist) 
        global negLapFreq
        negLapFreq = nltk.probability.LaplaceProbDist(negFreqDist)

        #GetBigrams
        posBigrams = nltk.bigrams(posCorpus.words())
        negBigrams = nltk.bigrams(negCorpus.words())

        #Get no. of words per corpus
        posWordLen = len(posCorpus.words())
        negWordLen = len(negCorpus.words())


        #FreqDist for Bigrams
        global posBiFreq
        posBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(posBigrams))
        global negBiFreq
        negBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(negBigrams))
Author: abhinavmishra590 | Project: Sentiment-Based-Document-Classification | Lines: 31 | Source: Lang_Model.py
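Note: FreqDist.inc() is the NLTK 2 API and was removed in NLTK 3. A minimal sketch of the same counting on a current install:

    from nltk import FreqDist

    # one pass over the corpus replaces the per-word inc() loop
    posFreqDist = FreqDist(posCorpus.words())
    negFreqDist = FreqDist(negCorpus.words())
    # incremental equivalent: posFreqDist[word] += 1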


Example 2: find_colloc

def find_colloc(data):  # find most common collocations
    def check(wb, tb):
        if len(wb[0]) <= 1 or len(wb[1]) <= 2:
            return False
        try:
            if detect(wb[0]) != "ar" or detect(wb[1]) != "ar":
                return False
        except:
            return False

        if tb in [("NN", "NN"), ("NN", "DTNN"), ("NNP", "NNP")]:
            return True
        return False

    bigrams = FreqDist()

    for d in data:
        tokens = d["tokens"]
        words_bigrams = nltk.bigrams([t[0] for t in tokens])
        tags_bigrams = nltk.bigrams([t[1] for t in tokens])

        for wb, tb in zip(words_bigrams, tags_bigrams):
            if check(wb, tb):
                bigrams[wb] += 1

    return bigrams
Author: ayat-rashad | Project: eg_twitter | Lines: 26 | Source: ner.py
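The detect function is not defined in the snippet; judging from the usage it is presumably langdetect.detect, which returns an ISO 639-1 code such as "ar" (an assumption, not confirmed by the source). A usage sketch:

    # assumption: detect comes from the langdetect package
    from langdetect import detect

    print(detect("مرحبا بالعالم"))  # expected: "ar"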


Example 3: similarity

def similarity(paper1,paper2):
    score=[]
    stops=nltk.corpus.stopwords.words('english') #stopwords to weed out

##compare the titles and score the word cosine similarity 
    title1 = paper1[1]
    title2 = paper2[1]
    tokens1=[w for w in nltk.word_tokenize(title1) if w not in stops]
    tokens2=[w for w in nltk.word_tokenize(title2) if w not in stops]
    fd1=nltk.FreqDist(tokens1)
    fd2=nltk.FreqDist(tokens2)
    keys=list(set(list(fd1.keys())+list(fd2.keys())))
    scoretemp=0
    for key in keys:
      scoretemp += fd1[key]*fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
      score.append(1-scoretemp/a)
    else:
      score.append(0)
    
##compare the abstracts and score single word cosine similarity 
    abstract1 = paper1[3]
    abstract2 = paper2[3]
    tokens1=[w for w in nltk.word_tokenize(abstract1) if w not in stops]
    tokens2=[w for w in nltk.word_tokenize(abstract2) if w not in stops]
    fd1=nltk.FreqDist(tokens1)
    fd2=nltk.FreqDist(tokens2)
    keys=list(set(list(fd1.keys())+list(fd2.keys())))
    scoretemp=0
    for key in keys:
      scoretemp += fd1[key]*fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
      score.append(1-scoretemp/(numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))))    
    else:
      score.append(0)

##compare the abstracts and score bigram cosine similarity 
    tokens1 = nltk.word_tokenize(abstract1)
    tokens2 = nltk.word_tokenize(abstract2)
    bgsall1 = nltk.bigrams(tokens1)
    bgsall2 = nltk.bigrams(tokens2)
    bgs1 = [bg for bg in bgsall1 if bg[0] not in stops and bg[1] not in stops]
    bgs2 = [bg for bg in bgsall2 if bg[0] not in stops and bg[1] not in stops]
    fd1=nltk.FreqDist(bgs1)
    fd2=nltk.FreqDist(bgs2)
    keys=list(set(list(fd1.keys())+list(fd2.keys())))
    scoretemp=0
    for key in keys:
      scoretemp += fd1[key]*fd2[key]
#    print(fd1.values())
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
      score.append(1-scoretemp/(numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
    else:
      score.append(0)

##total score is sum of the three scores    
    return sum(score)
Author: niranjansd | Project: publishdat | Lines: 60 | Source: Analyze.py
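The three scoring blocks above repeat the same cosine computation; a sketch of a shared helper (the name cosine_distance is hypothetical) that each block could call with its two FreqDists:

    import numpy

    def cosine_distance(fd1, fd2):
        # 1 - cosine similarity between two frequency distributions
        keys = set(fd1) | set(fd2)
        dot = sum(fd1[k] * fd2[k] for k in keys)
        norm = (numpy.linalg.norm(numpy.asarray(list(fd1.values()))) *
                numpy.linalg.norm(numpy.asarray(list(fd2.values()))))
        return 1 - dot / norm if norm else 0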


Example 4: test

def test():
	uniDictList = [{} for x in range(6)]
	biDictList = [{} for x in range(6)]
	vocabSize = [0 for x in range(6)]
	totalSize = [0 for x in range(6)]
	biVocabSize = [0 for x in range(6)]
	bitotalSize = [0 for x in range(6)]
	numList = [0 for x in range(6)]
	numCorrect = total = 0

	# randomly split set 
	for entry in entryList:
		if random.random() > 0.10:
			entry.test = 0
		else:
			entry.test = 1

	# compute train dictionaries
	for entry in entryList:
		if entry.test == 0:
			for word in entry.review.split():
				uniDictList[entry.rating][word] = uniDictList[entry.rating].get(word,0)+1 

			for bigram in bigrams(entry.review.split()):
				biDictList[entry.rating][bigram] = biDictList[entry.rating].get(bigram,0)+1
							 
			numList[entry.rating] += 1


	print numList

	totalCount = reduce(lambda x,y: x+y, numList)


	# compute dictionary stats
	for x in xrange(1,6):
		vocabSize[x] = len(uniDictList[x].keys())
		totalSize[x] = reduce(lambda x,y: x+y,uniDictList[x].values())
		biVocabSize[x] = len(biDictList[x].keys())
		bitotalSize[x] = reduce(lambda x,y: x+y,biDictList[x].values())
			
	# testing
	for entry in entryList:
		if entry.test == 1:
			rankProb = [0 for x in range(6)]
			for x in range(1,6):
				for word in entry.review.split():
					rankProb[x] += math.log(uniDictList[x].get(word,1)) - math.log(vocabSize[x]+totalSize[x])
				for bigram in bigrams(entry.review.split()):
					rankProb[x] += math.log(biDictList[x].get(bigram,1)) - math.log(biVocabSize[x]+bitotalSize[x])

			for x in range(1,6):
				# weight each candidate rating by its class prior, in log space
				rankProb[x] += math.log(float(numList[x])/totalCount)
			entry.pRating = rankProb.index(max(rankProb[1:6]))
			if entry.pRating == entry.rating:
				numCorrect += 1
			total += 1


	return [numCorrect, total]
Author: dsedra | Project: yproject | Lines: 60 | Source: naive_baise_unibigram.py


Example 5: estimateLikelihood

 def estimateLikelihood(self):
   uniqBigrams = set()
   uniqCount = 0
   for tweet in self._focusTweets['aae']:
     tweet = tweet.split('\t')
     for bigram in nltk.bigrams(tweet):
       try:
         dummy = self._biDict[bigram]
         self._likelihood['aae'][bigram] += 1
         self._likelihood['aae']['__BITOTAL__'] += 1
         if bigram not in uniqBigrams:
           uniqBigrams.add(bigram)
           uniqCount += 1
       except:
         continue
   self._likelihood['aae']['__BITOTAL__'] += uniqCount ## Adding vocab to total for add one smoothing!!
   sys.stderr.write("Likelihood Bigram Entries AAE:"+str(len(self._likelihood['aae']))+"\n")
   uniqBigrams = set()
   uniqCount = 0
   for tweet in self._focusTweets['mse']:
     tweet = tweet.split('\t')
     for bigram in nltk.bigrams(tweet):
       try:
         dummy = self._biDict[bigram]
         self._likelihood['mse'][bigram] += 1
         self._likelihood['mse']['__BITOTAL__'] += 1
         if bigram not in uniqBigrams:
           uniqBigrams.add(bigram)
           uniqCount += 1
       except:
         continue
   self._likelihood['mse']['__BITOTAL__'] += uniqCount
   sys.stderr.write("Likelihood Bigram Entries MSE:"+str(len(self._likelihood['mse']))+"\n")
Author: phanigadde | Project: CSRelated | Lines: 33 | Source: codeStore.py
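The two blocks differ only in the dialect key ('aae' vs 'mse'); a sketch of the same logic folded into one loop, assuming _biDict is a plain dict:

    for dialect in ('aae', 'mse'):
        uniq_bigrams = set()
        for tweet in self._focusTweets[dialect]:
            for bigram in nltk.bigrams(tweet.split('\t')):
                if bigram in self._biDict:
                    self._likelihood[dialect][bigram] += 1
                    self._likelihood[dialect]['__BITOTAL__'] += 1
                    uniq_bigrams.add(bigram)
        # add the vocabulary size to the total for add-one smoothing
        self._likelihood[dialect]['__BITOTAL__'] += len(uniq_bigrams)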


Example 6: to_bigram

  def to_bigram(self, termpos):
    words = [elem[0] for elem in termpos]
    pos_tags = [elem[1] for elem in termpos]

    b_words = nltk.bigrams(words)
    b_pos = nltk.bigrams(pos_tags)
    return (b_words, b_pos)
Author: saidalfaraby | Project: SSLP | Lines: 7 | Source: FeatureExtraction.py


Example 7: main

def main():
	text = open('holmes.txt').read()
	tokens = nltk.wordpunct_tokenize(text)
	charList = []
	for word in tokens:
		for char in word:
			charList.append(char)
	fDistChars = nltk.FreqDist(charList)
	fDistWords = nltk.FreqDist(tokens)
	
	print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars),sorted(fDistChars)))
	print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords),sorted(fDistWords)))
	
	bigramChars = nltk.bigrams(charList)
	trigramChars = nltk.trigrams(charList)

	print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(charList), 
		most_common(bigramChars), most_common(trigramChars)))

	bigramWords = nltk.bigrams(tokens)
	trigramWords = nltk.trigrams(tokens)

	print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(most_common(tokens), 
		most_common(bigramWords), most_common(trigramWords)))
	
	bigram_measures = nltk.collocations.BigramAssocMeasures()
	finder = BigramCollocationFinder.from_words(tokens)
	scoredPMI = finder.score_ngrams(bigram_measures.pmi)
	scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
	
	print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{} \nChi's square\n{}" .format(scoredPMI[:20],scoredCHI[:20]))
	
	print("\nSpearmans correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
Author: Martbov | Project: pta-group1 | Lines: 33 | Source: assignment1.py
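PMI is known to over-reward rare pairs; a sketch that adds NLTK's frequency filter before scoring (same finder API as above):

    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)  # ignore bigrams that occur fewer than 3 times
    print(finder.nbest(bigram_measures.pmi, 20))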


Example 8: textsimilarity

def textsimilarity(text1,text2):
    score=[]
    stops=nltk.corpus.stopwords.words('english') #stopwords to weed out
    stops = stops + ['we',',','.','(',')','using','new','propose','investigate']
    stops = stops + ['-','show','infer','novel','method']

#get tokens and bigrams from the text, either string or list of keywords
    if type(text1) is not list:
      alltokens = nltk.word_tokenize(text1.lower())
      allpairs = [list(pair) for pair in nltk.bigrams(alltokens)]
      tokens1 = [token for token in alltokens if token not in stops]
      pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
      alltokens = []
      allpairs = []
      for el in text1:
        atokens = nltk.word_tokenize(el.lower())
        alltokens += atokens
        apairs = [list(pair) for pair in nltk.bigrams(atokens)]
        allpairs += apairs
      tokens1 = [token for token in alltokens if token not in stops]
      pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]

    if type(text2) is not list:
      tokens = nltk.word_tokenize(text2.lower())
      allpairs = [list(pair) for pair in nltk.bigrams(tokens)]
      tokens2 = [token for token in tokens if token not in stops]
      pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
      alltokens = []
      allpairs = []
      for el in text2:
        atokens = nltk.word_tokenize(el.lower())
        alltokens += atokens
        apairs = [list(pair) for pair in nltk.bigrams(atokens)]
        allpairs += apairs
      tokens2 = [token for token in alltokens if token not in stops]
      pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
      
###score single word cosine similarity
##    fd1=nltk.FreqDist(tokens1)
##    fd2=nltk.FreqDist(tokens2)
##    keys=list(set(list(fd1.keys())+list(fd2.keys())))
##    scoretemp=0
##    for key in keys:
##      scoretemp += fd1[key]*fd2[key]
##    score.append(1-scoretemp/(numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
##    
####score bigram cosine similarity 
##    fd1=nltk.FreqDist(pairs1)
##    fd2=nltk.FreqDist(pairs2)
##    keys=list(set(list(fd1.keys())+list(fd2.keys())))
##    scoretemp=0
##    for key in keys:
##      scoretemp += fd1[key]*fd2[key]
##    score.append(1-scoretemp/(numpy.linalg.norm(numpy.asarray(list(fd1.values())))*numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
    score.append(sum(1 for token in tokens1 if token in tokens2))
    score.append(sum(1 for pair in pairs1 if pair in pairs2))
    print('done')
##total score is sum of the the scores    
    return sum(score)
Author: niranjansd | Project: publishdat | Lines: 59 | Source: Analyze.py


Example 9: main

def main():
    
    # Corpus Location
    #for training data
    posTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_train'
    negTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_train'

    #for test data
    posTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_test'
    negTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_test'

    # Create Plain Text Corpus for training data
    posCorpus = PlaintextCorpusReader(posTrainCorpus, '.*')
    negCorpus = PlaintextCorpusReader(negTrainCorpus, '.*')


    # Create Plain Text Corpus for test data
    posTstCorpus = PlaintextCorpusReader(posTestCorpus, '.*')
    negTstCorpus = PlaintextCorpusReader(negTestCorpus, '.*')
    
    #GetBigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())

    #Get no. of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())
    
    # Creating object of Lang_Model_classifier
    obj1 = Lang_Model_Classifier()
    obj1.freq_dst(posCorpus, negCorpus)
    
    #For negative test data
    for filename in os.listdir(negTestCorpus):
        wordSet =  negTstCorpus.words(filename)
    
        print '**Unigram**'
        unigr = obj1.perp(wordSet)
    
        print unigr
    
        print '**Bigram**'
        bigr = obj1.perpBi(nltk.bigrams(wordSet))
    
        print bigr
        
    #For positive test data    
    for filename in os.listdir(posTestCorpus):
        wordSet2 =  posTstCorpus.words(filename)
    
        print '**Unigram**'
        posunigr = obj1.perp(wordSet2)
    
        print posunigr
    
        print '**Bigram**'
        posbigr = obj1.perpBi(nltk.bigrams(wordSet2))
    
        print posbigr
Author: abhinavmishra590 | Project: Sentiment-Based-Document-Classification | Lines: 59 | Source: Lang_Model.py


Example 10: hybrid_cfdist

def hybrid_cfdist():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    sherlock_bigrams = nltk.bigrams(sherlock_corpus.words())

    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    pokemon_bigrams = nltk.bigrams(pokemon_corpus.words())

    return nltk.ConditionalFreqDist(sherlock_bigrams + pokemon_bigrams)
Author: mikeholler | Project: CSC499-NLP | Lines: 8 | Source: text_generation.py
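In NLTK 3, nltk.bigrams returns a generator, so the + concatenation above raises a TypeError; a sketch of a compatible variant using itertools.chain:

    from itertools import chain

    def hybrid_cfdist():
        sherlock = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
        pokemon = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
        # chain the two bigram generators instead of concatenating lists
        return nltk.ConditionalFreqDist(chain(nltk.bigrams(sherlock.words()),
                                              nltk.bigrams(pokemon.words())))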


Example 11: how_is_often_used_in_text

def how_is_often_used_in_text():
    from nltk.corpus import brown

    brown_learned_text = brown.words(categories="learned")
    print sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == "often"))
    # or use the tagged words for the actual POS tags
    brown_learned_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    fd = nltk.FreqDist([b[1] for (a, b) in nltk.bigrams(brown_learned_tagged) if a[0] == "often"])
    fd.tabulate()
Author: prashiyn | Project: nltk-examples | Lines: 9 | Source: ch05.py
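simplify_tags=True is the NLTK 2 interface; in NLTK 3 the closest equivalent is the universal tagset (a sketch):

    brown_learned_tagged = brown.tagged_words(categories="learned", tagset="universal")
    fd = nltk.FreqDist(b[1] for (a, b) in nltk.bigrams(brown_learned_tagged) if a[0] == "often")
    fd.tabulate()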


Example 12: wordlistfun

def wordlistfun(filename):
    minlength = 2
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    wordlist = []
    wordfreq = []
    hashlist = []
    hashfreq = []

    with open(filename, "r") as f:
        count_all = Counter()
        count_hash = Counter()
        count_only = Counter()
        count_bi = Counter()
        count_only2 = Counter()
        count_bigramonly = Counter()
        count_bigramstop = Counter()
        for line in f:
            try:
                tweet = json.loads(line)
                # Create a list with all the terms
                terms_stop = [
                    term for term in preprocess(tweet["text"]) if term.lower() not in stop
                ]  # Update the counter
                terms_hash = [term for term in preprocess(tweet["text"]) if term.lower().startswith("#")]
                terms_only = [
                    term
                    for term in preprocess(tweet["text"])
                    if term.lower() not in stop and not term.lower().startswith(("#", "@"))
                ]
                # mind the ((double brackets))
                # startswith() takes a tuple (not a list) if # we pass a list of inputs
                terms_only2 = [
                    term.encode("unicode-escape")
                    for term in preprocess(tweet["text"])
                    if term.lower() not in stop
                    and not term.lower().startswith(("#", "@"))
                    and not term.lower().startswith(("htt", "\u"))
                    and term.lower() not in [r"(?:(?:\d+,?)+(?:\.?\d+)?)"]
                    and len(term) > minlength
                ]

                terms_bigramstop = bigrams(terms_stop)
                terms_bigramonly = bigrams(terms_only2)

                count_all.update(terms_stop)
                count_hash.update(terms_hash)
                count_only.update(terms_only)
                count_only2.update(terms_only2)

                count_bigramonly.update(terms_bigramonly)
                count_bigramstop.update(terms_bigramstop)
            except:
                pass

        wordlist, wordfreq = zip(*count_only2.most_common())
        hashlist, hashfreq = zip(*count_hash.most_common())
    return wordlist, wordfreq, hashlist, hashfreq
Author: btolga | Project: Insight_Project | Lines: 57 | Source: wordcloud.py


Example 13: do_ir2

def do_ir2(db, param):
    print 'Computazione di IR2', db, param, '...'

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower()) if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project ={'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)

    n = max(bigrams.keys())
    dataset = []

    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])

    mongo_clusters = []
    for k, v in clusters.items():
        mongo_clusters.append({'cluster': k, 'projects': v})

    # Mongo raises this error: InvalidDocument: Cannot encode object: 0
    print mongo_clusters
    # Save to the Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print 'Fatto!'
Author: lum4chi | Project: IR | Lines: 57 | Source: G8.py
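The InvalidDocument error mentioned in the comment is most likely because k is a numpy integer (scikit-learn's KMeans produces numpy labels), which pymongo cannot encode; casting it fixes the insert (a sketch):

    mongo_clusters.append({'cluster': int(k), 'projects': v})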


Example 14: extract_bigrams

def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount,100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")
            
    tempVector = dict()
        
    #Create your bigrams
    bgs = nltk.bigrams(bagOfWords)

    fdist = nltk.FreqDist(bgs)   
    
    for k in fdist.keys()[:100]:
        tempVector[k] = 0
    
    
    theKeys = tempVector.keys()
    
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)      
                featureMatrix[index][keyInd] += 1
                           
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
        
            if index >= commentCount:
                break            
            
            
    
    
    print "non-zero",np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" %(float(np.count_nonzero(featureMatrix))/(featureMatrix.shape[0]*featureMatrix.shape[1]))
    return featureMatrix
Author: DirkBrand | Project: Comment-Classification | Lines: 56 | Source: mainExtractor.py
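fdist.keys()[:100] relies on NLTK 2, where FreqDist.keys() returned keys sorted by frequency and supported slicing; on NLTK 3 the equivalent is (a sketch):

    for k, _ in fdist.most_common(100):
        tempVector[k] = 0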


Example 15: featureSets

   def featureSets(data): #data accepted as (rating, list of words)
      fs = [] 
      for (r, words) in data:
         nicewords = [word.lower() for word in words if not isStopWord(word) and not isPunctuation(word)]
         for bigram in nltk.bigrams(nicewords):
            fs.append((BigramClassifier.features(bigram),r))

      return fs
Author: slyngbaek | Project: restaurant-reviewer | Lines: 10 | Source: classifiers.py


Example 16: get_joint_entropy

    def get_joint_entropy(string1, string2):
        first_bigram = list(nltk.bigrams(string1.lower()))
        second_bigram = list(nltk.bigrams(string2.lower()))
        combo = first_bigram + second_bigram
        bigram_dict = collections.Counter(combo)

        for i in bigram_dict:
            if i in first_bigram and i in second_bigram:
                value = float(bigram_dict[i]) / float(len(combo))
                yield value
Author: TheBigGinge | Project: Analytics | Lines: 10 | Source: entropy.py


Example 17: joint_entropy

def joint_entropy(string1, string2):
    x = []
    bi1 = list(nltk.bigrams(string1.lower()))
    bi2 = list(nltk.bigrams(string2.lower()))
    combo = bi1 + bi2
    yes = list(set(combo))
    for i in yes:
        if i in bi1 and i in bi2:
            count = (float(bi1.count(i))+float(bi2.count(i)))/float(len(combo))
            x.append(count)
    calc = sum(i*np.log2(i) for i in x)*-1
    return calc
Author: TheBigGinge | Project: Analytics | Lines: 12 | Source: entropy.py
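A usage sketch (hypothetical inputs); the score is an entropy over character bigrams shared by the two strings:

    print(joint_entropy("night", "nacht"))  # only the shared bigram ('h', 't') contributes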


Example 18: bigrami

def bigrami(documents,dg1,gg1,dg2,gg2):
    bigram = []
    stopwords = nltk.corpus.stopwords.words('english')
    for i in range(dg1,gg1):
        bigram.append([w for w in bigrams(documents[i][0])])
    for i in range(dg2,gg2):
        bigram.append([w for w in bigrams(documents[i][0])])
    result = []
    [result.extend(w) for w in bigram]
    result = [w for w in result if w[0] not in stopwords and w[1] not in stopwords and w[0] and nije_interpunkcija(w[0]) and nije_interpunkcija(w[1])]
    result = nltk.FreqDist(result)
    return result.keys()
Author: Hermina | Project: Machine-learning | Lines: 12 | Source: naivebayes.py


Example 19: exercise_bigrams

def exercise_bigrams():
    sent = ["In", "the", "beginning", "God", "created", "the heaven", "and", "the earth"]
    print list(nltk.bigrams(sent))

    text = nltk.corpus.genesis.words("english-kjv.txt")
    bigrams = nltk.bigrams(text)
    cfd = nltk.ConditionalFreqDist(bigrams)

    word = "living"
    for i in range(15):
        print word
        word = cfd[word].max()
Author: BurnellLiu | Project: LiuProject | Lines: 12 | Source: chapter_02.py


Example 20: jacquard_bigram

def jacquard_bigram(query):
    final=[]
    for a in file('enwiktionary.a.list'):
        a=a.rstrip()
        bigram=set(nltk.bigrams(a))
        q_bigram=set(nltk.bigrams(query))
        intersect=q_bigram.intersection(bigram)
        union=q_bigram.union(bigram)
        sim=float(len(intersect))/len(union)
        
        final.append([a,sim])
    final_sorted= sorted(final,key=lambda sim:sim[1], reverse=True)
    print final_sorted[:10]
Author: jubimishra | Project: Data-Mining | Lines: 13 | Source: jacquard_vs_levenshtein.py
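Note that calling nltk.bigrams on a plain string yields character pairs, which is what makes this a character-level Jaccard similarity:

    print(list(nltk.bigrams("apple")))
    # [('a', 'p'), ('p', 'p'), ('p', 'l'), ('l', 'e')]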



Note: The nltk.bigrams examples in this article were collected by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their original authors, who retain copyright; consult each project's license before redistributing or reusing the code.

