
Python nltk.ngrams Function Code Examples


This article collects typical usage examples of the nltk.ngrams function in Python. If you have been wondering what exactly nltk.ngrams does, how to call it, or what it looks like in real code, the curated examples below should help.



The following presents 20 code examples of the ngrams function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
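
Before turning to the examples, here is a minimal, self-contained sketch of what nltk.ngrams does: it slides a window of length n over any sequence and yields each window as a tuple. (The sample sentence is made up purely for illustration; str.split is used instead of a tokenizer so the snippet runs without downloading any NLTK data.)

from nltk import ngrams

# Hypothetical sample text, purely for illustration.
tokens = "the quick brown fox jumps over the lazy dog".split()

# ngrams() returns a generator of n-tuples; wrap it in list() to reuse it.
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

print(bigrams[:3])   # [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
print(trigrams[:2])  # [('the', 'quick', 'brown'), ('quick', 'brown', 'fox')]

Note that nltk.ngrams returned a list in old NLTK 2.x releases but returns a generator in NLTK 3.x; several of the older snippets below assume the list behavior, which is why wrapping the call in list() appears so often.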

Example 1: ngrams

 def ngrams(self, ns=(2, 3, 5)):
     # A tuple default avoids the shared-mutable-default pitfall of ns=[2, 3, 5].
     # Join each suffix with its POS tag, e.g. "suffix/NN".
     _p = ["/".join(t) for t in zip(self.SUF, self.POS)]
     for n in ns:
         ngf = {"Ngram(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(self.SUF, n)}
         ngfp = {"NgramP(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(_p, n)}
         # Update inside the loop so the features for every n are kept;
         # the original updated after the loop and recorded only the last n.
         self.features.update(ngf)
         self.features.update(ngfp)
Author: tuxedocat, Project: precure, Lines: 7, Source: feature_extractor.py


Example 2: update_freqs

 def update_freqs(self, doc_text, id_str):
     for bigram in list(ngrams(doc_text, 2)):
         k = bigram[0] + u"_" + bigram[1]
         self.bicount.update([k])
         self.bigram_to_ids[k] = self.bigram_to_ids.get(k, []) + [id_str]
     for trigram in list(ngrams(doc_text, 3)):
         k = trigram[0] + u"_" + trigram[1] + u"_" + trigram[2]
         self.tricount.update([k])
         self.trigram_to_ids[k] = self.trigram_to_ids.get(k, []) + [id_str]
Author: jtmurphy89, Project: twitter_challenge, Lines: 9, Source: part1.py


Example 3: get_gram_ratio

def get_gram_ratio(w2v, text1, text2, n_grams_1=1, n_grams_2=1, n_jobs=1):
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    pairs = list(iter_product(t1, t2, repeat=1))
    res = list(map(lambda x: similarity(w2v, x), pairs))
    if len(res) == 0:
        return 0
    else:
        return np.mean(res)
Author: KhaoticMind, Project: kaggle-homedepot, Lines: 9, Source: helper_processing.py


Example 4: ngrams_extract

def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    l = list
    # Concatenate character 2-, 3-, 4- and 5-grams of the string.
    grams = l(ngrams(string, 2)) + l(ngrams(string, 3)) + l(ngrams(string, 4)) + l(ngrams(string, 5))
    SIZE = 1024
    vec = zeros((SIZE,))
    # Hash each n-gram into a fixed-size bag-of-features vector (the "hashing trick").
    for t in grams:
        vec[hash(t) % SIZE] += 1
    return log(vec + 1.0)
Author: joshsaxe, Project: eXposeDeepNeuralNetwork, Lines: 10, Source: features.py


Example 5: build_ngram

 def build_ngram(source):
     ngram_set = {}
     for key, value in source.items():
         ngram = []
         for line in value:
             if IS_PAD:
                 ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL, pad_left=True, pad_right=True, pad_symbol='SSS'))
             else:
                 ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL))
         ngram_set[key] = ngram
     return ngram_set
Author: Tiotao, Project: CS3245HW1, Lines: 11, Source: build_test_LM.py
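
A quick aside on the padding flags used in Example 5: with pad_left/pad_right, the n-gram window is allowed to extend past the sequence boundary, and the overhang is filled with a pad symbol. The pad_symbol= keyword in Example 5 comes from an older NLTK release; the minimal sketch below uses the NLTK 3.x spelling, left_pad_symbol=/right_pad_symbol=.

from nltk import ngrams

# Padding demo on a two-character string.
print(list(ngrams("ab", 2, pad_left=True, pad_right=True,
                  left_pad_symbol='S', right_pad_symbol='S')))
# [('S', 'a'), ('a', 'b'), ('b', 'S')]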


Example 6: read_data

def read_data(type):
    datapath = '../data/' + type + '/'
    data = {}
    maxindex = 500
    count = 0
    unigrams = []
    bigrams = []
    dependecies = []
    for c in string.ascii_uppercase:
        data[c] = {}
        for i in range(1, maxindex):
            filename = datapath + c + str(i)
            txtpath = filename + '.data'
            metapath = filename + '.meta'
            text = read_file(txtpath)

            meta = read_file(metapath)
            if text is not None:
                count += 1
                # print (count)
                data[c][i] = {'text': text[0], 'meta': parse_meta(meta)}
                tokens = nltk.word_tokenize(text[0])

                data[c][i]['tokens'] = tokens
                data[c][i]['length'] = len(tokens)
                s = remove_punct(text[0])
                tokens = nltk.word_tokenize(remove_punct(s.lower()))

                data[c][i]['unigrams'] = list(nltk.ngrams(tokens, 1))
                data[c][i]['bigrams'] = list(nltk.ngrams(tokens, 2))

                # data[c][i]['dependencies'] = dependency_parse(text[0])
                # deppath = filename + '.dep'
                # with open (deppath, 'w') as f:
                #     json.dump(data[c][i]['dependencies'],f)
                # with open (deppath, 'r') as f:
                #     data[c][i]['dependencies'] = json.load(f)


                unigrams.extend(data[c][i]['unigrams'])
                bigrams.extend(data[c][i]['bigrams'])
                # dependecies.extend(data[c][i]['dependencies'])

        data[c]['sequences'] = gen_sequences(data[c])
        data['unigram_model'] = create_model(unigrams, maxfeat=5000, minfreq=3)
        data['bigram_model'] = create_model(bigrams, maxfeat=5000, minfreq=3)
        # data['dependencies'] = create_model(dependecies, maxfeat=5000, minfreq=3)

    # pprint.pprint (data['unigram_model'])
    # pprint.pprint (data['bigram_model'])
    # pprint.pprint (data['dependencies'])

    # print(type, count)
    return data
Author: patwaria, Project: stance_classification, Lines: 54, Source: stance_classification.py


Example 7: lookup_phrases

def lookup_phrases(sentence, noun_types, ignore_case=False):
    # list() so the three n-gram generators can be concatenated (NLTK 3.x returns generators)
    phrases = list(ngrams(sentence, 3)) + list(ngrams(sentence, 2)) + list(ngrams(sentence, 1))
    matches = []
    for phrase in phrases:
        if contains_noun(phrase):
            phrase_str = u' '.join(w.form for w in phrase)
            if ignore_case:
                phrase_str = phrase_str.lower()
            types = noun_types.get(phrase_str)
            if types:
                matches.append((phrase, types))
    return sorted(matches)
Author: Noahs-ARK, Project: semafor, Lines: 12, Source: markup_sentence.py


Example 8: extract_ngrams

    def extract_ngrams(self, memes):
        for meme_type in memes:
            for meme in memes[meme_type]:
                top_unigrams = meme[0]
                bottom_unigrams = meme[1]
                all_unigrams = top_unigrams + bottom_unigrams

                # list() so the bigram generators can be concatenated with +
                top_bigrams = list(ngrams(meme[0], 2))
                bottom_bigrams = list(ngrams(meme[1], 2))
                all_bigrams = top_bigrams + bottom_bigrams

                # The original passed an undefined name `key`; meme_type appears to be the intent.
                self.add_ngrams(meme_type, top_unigrams, bottom_unigrams, all_unigrams,
                                top_bigrams, bottom_bigrams, all_bigrams)
Author: AlexeyMK, Project: DATASS, Lines: 12, Source: NgramsManager.py


Example 9: get_gram_ratio

def get_gram_ratio(text1, text2, w2v, n_grams_1=1, n_grams_2=1, w=30, h=2000):
    arr = np.ndarray((w, h), np.float32)
    arr.fill(0)
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    for i in range(len(t1)):
        for j in range(len(t2)):
            try:
                arr[i, j] = w2v.n_similarity(t1[i], t2[j])
            except:
                pass
    return arr
Author: KhaoticMind, Project: kaggle-homedepot, Lines: 12, Source: neural_test.py


Example 10: generate_location_vector

    def generate_location_vector(self, branch, index):
        if branch.text is not None:
            branch.text = branch.text.encode('ascii', 'ignore')

            if not branch.getchildren():
                sentences = branch.text.split('. ')
                for sentence in range(0, len(sentences)):
                    #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                    words = sentences[sentence].split()

                    for doc_word in range(0, len(words)):
                        word_location = (("{0}[{1}][{2}]".format(index, sentence, doc_word)), words[doc_word])
                        # any change in line below should be replicated in corpus.py also
                        symbols = ".,[]();:<>+=&+%[email protected]#~?{}|"
                        whitespace = "                       "
                        replace = maketrans(symbols, whitespace)
                        doc_word = word_location[1].translate(replace)
                        doc_word = doc_word.lstrip()
                        doc_word = doc_word.rstrip()
                        if len(doc_word) > 1 and not len(doc_word) > 16:
                            self.doc_words.append(doc_word)

                    doc_bigrams = bigrams(words)
                    if not len(doc_bigrams) < 1:
                        doc_bigrams = self.n_gram_cleaner(doc_bigrams)
                        for bi_gram in doc_bigrams:
                            bi_gram = ' '.join(bi_gram)
                            self.bi_grams.append(bi_gram)

                    doc_trigrams = trigrams(words)
                    if not len(doc_trigrams) < 1:
                        doc_trigrams = self.n_gram_cleaner(doc_trigrams)
                        for tri_gram in doc_trigrams:
                            tri_gram = ' '.join(tri_gram)
                            self.tri_grams.append(tri_gram)

                    doc_fourgrams = ngrams(words, 4)
                    if not len(doc_fourgrams) < 1:
                        doc_fourgrams = self.n_gram_cleaner(doc_fourgrams)
                        for four_gram in doc_fourgrams:
                            four_gram = ' '.join(four_gram)
                            self.four_grams.append(four_gram)

                    doc_fivegrams = ngrams(words, 5)
                    if not len(doc_fivegrams) < 1:
                        doc_fivegrams = self.n_gram_cleaner(doc_fivegrams)
                        for five_gram in doc_fivegrams:
                            five_gram = ' '.join(five_gram)
                            self.five_grams.append(five_gram)

            else:
                for subtree in range(0, len(branch)):
                    LocationVector.generate_location_vector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Author: arunenigma, Project: deva_algo, Lines: 53, Source: doc_analyzer.py


Example 11: get_top_ngrams_tfidf

def get_top_ngrams_tfidf(text, collection, NGRAM=2, cutoff=100, docs=None):
    bigs = nltk.ngrams(text, NGRAM)
    print 'totally', len(bigs), 'bigrams'
    bigs = remove_website_stopwords(bigs)
    freqdist = nltk.FreqDist(bigs)
    topwords = freqdist.keys()[:cutoff]
    # print len(topwords), 'topwords:', topwords[:30], freqdist[topwords[0]], freqdist[topwords[1]]
    from math import log
    if True:  # do_tfidf
        # Document frequencies: overall, per les_id, and per time_id.
        df = {}
        df_les = {}
        df_time = {}
        tfidf = {}
        for doc_id, text in docs.items():
            words = [w for w in nltk.ngrams(text, NGRAM)]
            les_id, time_id = doc_id.split(':')
            time_id = time_id.replace('.csv', '')
            time_id = time_id[0:8]
            for w in words:
                df.setdefault(w, set())
                df[w].add(doc_id)
                df_les.setdefault(w, set())
                df_les[w].add(les_id)
                df_time.setdefault(w, set())
                df_time[w].add(time_id)
        _cutoff = 10000
        _topwords = freqdist.keys()[:_cutoff]
        df0, df1, df2 = {}, {}, {}
        for w in _topwords:
            # print w
            try: df0[w] = len(df[w])
            except: df0[w] = 0
            try: df1[w] = len(df_les[w])
            except: df1[w] = 0
            try: df2[w] = len(df_time[w])
            except: df2[w] = 0
            tfidf[w] = freqdist[w] / (1 + df0[w])
        # print df0
        # get sorted words in decreasing order of tfidf values
        sortedwords = sorted(tfidf.items(), key=itemgetter(1), reverse=True)
        sortedwords = sortedwords[:cutoff]
        topwords = [w for w, s in sortedwords]
        sortedwords0 = sorted(df0.items(), key=itemgetter(1), reverse=True)
        sortedwords1 = sorted(df1.items(), key=itemgetter(1), reverse=True)
        sortedwords2 = sorted(df2.items(), key=itemgetter(1), reverse=True)
        print 'TF-IDF topwords:'
        print len(topwords), 'topwords:', sortedwords[:50], freqdist[topwords[0]], freqdist[topwords[1]]
        print sortedwords0[:30]
        print sortedwords1[:30]
        print sortedwords2[:30]
        return topwords, freqdist, df0, df1, df2
    return topwords, freqdist
Author: iamhighman, Project: GoogleNewsAnalysis, Lines: 52, Source: nltk_utils.py


Example 12: __call__

 def __call__(self, words):
     grams = list(ngrams(words, 2)) + list(ngrams(words, 3))
     positives = [
         (i, len(gram), gram) for i, gram in enumerate(grams)
         if self.colls[len(gram)][gram]
     ]
     if not positives:
         return words
     positives.sort(key=lambda x: (x[1], len(words) - x[0]), reverse=True)
     matches, covered = self.__non_overlapping(positives)
     unigrams = [(i, w) for i, w in enumerate(words) if i not in covered]
     catted = sorted(matches + unigrams)
     # list() so the zip object is indexable under Python 3
     return list(zip(*catted))[1]
Author: JordiCarreraVentura, Project: wlp, Lines: 13, Source: Collocations.py


Example 13: generateLocationVector

    def generateLocationVector(self, branch, index):
        if branch.text is not None:
            branch.text = branch.text.encode('ascii', 'ignore')

            if not branch.getchildren():
                sentences = branch.text.split('. ')

                for sentence in range(0, len(sentences)):
                    #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                    words = sentences[sentence].split()

                    for word in range(0, len(words)):
                        word_location = (("{0}[{1}][{2}]".format(index, sentence, word)), words[word])
                        symbols = ",[]();:<>+=&+%[email protected]#~?{}|"
                        whitespace = "                      "
                        replace = maketrans(symbols, whitespace)
                        spec_word = word_location[1].translate(replace)
                        spec_word = spec_word.lstrip()
                        spec_word = spec_word.rstrip()

                        if len(spec_word) > 1 and not len(spec_word) > 16:
                            self.spec_words.append(spec_word)

                    bi_grams = bigrams(words)
                    if not len(bi_grams) < 1:
                        for bi_gram in bi_grams:
                            bi_gram = ' '.join(bi_gram)
                            self.bi_grams.append(bi_gram)

                    tri_grams = trigrams(words)
                    if not len(tri_grams) < 1:
                        for tri_gram in tri_grams:
                            tri_gram = ' '.join(tri_gram)
                            self.tri_grams.append(tri_gram)

                    four_grams = ngrams(words, 4)
                    if not len(four_grams) < 1:
                        for four_gram in four_grams:
                            four_gram = ' '.join(four_gram)
                            self.four_grams.append(four_gram)

                    five_grams = ngrams(words, 5)
                    if not len(five_grams) < 1:
                        for five_gram in five_grams:
                            five_gram = ' '.join(five_gram)
                            self.five_grams.append(five_gram)                    

            else:
                for subtree in range(0, len(branch)):
                    Corpus.generateLocationVector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Author: arunenigma, Project: Scenario-Mining, Lines: 50, Source: corpus.py


Example 14: __init__

    def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value)
Author: steven-s, Project: text-shingles, Lines: 14, Source: shingles.py


Example 15: train

 def train(self, words, tagged=False):
     if tagged is True:
         tags = []
         for i in range(len(words)):
             tags.append(words[i][1])
         self.ngrams = list(nltk.ngrams(tags, self.n))
     else:
         # text = nltk.word_tokenize(words)
         tagged_words = nltk.pos_tag(words)
         universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
         self.ngrams = list(nltk.ngrams(universal_tags, self.n))
     self.frequencies = nltk.FreqDist(self.ngrams)
     self.probs_ng = nltk.MLEProbDist(self.frequencies)
     print(self.probs_ng)
Author: sofiabroome, Project: wordpredictor, Lines: 14, Source: GrammarModel.py


Example 16: jacquard_fivegram

def jacquard_fivegram(query):
    final = []
    for a in open('enwiktionary.a.list'):   # open() replaces the removed Python 2 file() builtin
        a = a.rstrip()
        # Jaccard similarity over character 5-gram sets.
        fivegram = set(nltk.ngrams(a, 5))
        q_fivegram = set(nltk.ngrams(query, 5))
        intersect = q_fivegram.intersection(fivegram)
        union = q_fivegram.union(fivegram)
        sim = float(len(intersect)) / len(union)

        final.append([a, sim])
    final_sorted = sorted(final, key=lambda sim: sim[1], reverse=True)
    print(final_sorted[:10])
Author: jubimishra, Project: Data-Mining, Lines: 14, Source: jacquard_vs_levenshtein.py


Example 17: count_alliteration

def count_alliteration(tokens):
    allit_instances = []
    #ignore stopwords
    tokens = [token for token in tokens if not(is_punctuation(token) or is_stopword(token))]
    
    bigrams = nltk.ngrams(tokens,2)
    for one,two in bigrams:
        if has_alliteration(one,two):
            allit_instances.append((one,two))
    trigrams = nltk.ngrams(tokens,3)
    for one,two,three in trigrams:
        #the not avoids double counting
        if has_alliteration(one,three) and not has_alliteration(one,two):
            allit_instances.append((one,two,three))
    return len(allit_instances)
Author: BAH-DSST, Project: QuantifyingRhetoric_ODSCEast2016, Lines: 15, Source: rhetoric.py


Example 18: calc_precision

def calc_precision(n, translation, reference):
    total = 0
    correct = 0
    for i in range(min(len(translation), len(reference))):
        # list() so the n-gram sequences support len() and repeated membership tests
        tra_ngrams = list(nltk.ngrams(translation[i].split(), n))
        ref_ngrams = list(nltk.ngrams(reference[i].split(), n))
        total += min(len(ref_ngrams), len(tra_ngrams))
        for ng in tra_ngrams:
            if ng in ref_ngrams:
                correct += 1
    print("total: " + str(total) + ", correct: " + str(correct))
    if total == 0:
        return 0
    precision = float(correct) / total
    return precision
Author: jvalansi, Project: Machine_Translation, Lines: 15, Source: bleu.py


Example 19: get_date_from_utterance

def get_date_from_utterance(tokenized_utterance: List[Token],
                            year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """

    dates = []

    utterance = ' '.join([token.text for token in tokenized_utterance])
    year_result = re.findall(r'199[0-4]', utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print('invalid month day')
    return dates
Author: apmoore1, Project: allennlp, Lines: 48, Source: atis_tables.py


Example 20: __fromcursor__

 def __fromcursor__(self):
     self.data = []
     for document in c['Body'][self.source].find({
         'term' : self.term,
         'date' : {'$gt' : self.start_date, '$lt' : self.stop_date},
         'str_type' : self.str_type.__name__,
         'n' : self.n
     }, {
     'documents' : 1
     }, no_cursor_timeout=True):
         for _id in document['documents']:
             comment = get_comment(_id, self.source)
             gram_list = []
             for ngram in ngrams(comment[self.str_type.__name__], self.n):
                 gram_list.append(Gram(ngram).term)
             if self.position:
                 # The original referenced a bare `position` and discarded the result
                 # of `+ 1`; self.position and += appear to be the intent.
                 loc = gram_list.index(self.term) + self.position
                 self[gram_list[loc]] += 1
             else:
                 gram_list.remove(self.term)
                 for gram in gram_list:
                     self[gram] += 1
     try:
         self * (sum(self) ** -1)
     except ZeroDivisionError:
         raise ValueError("No comments with term {} found".format(self.term))
     self.__tocollection__()
Author: deniederhut, Project: redicorpus, Lines: 27, Source: objects.py



Note: the nltk.ngrams examples in this article were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from open-source projects contributed by their respective authors, and copyright remains with the original authors; consult each project's license before distributing or reusing the code. Please do not repost without permission.

