• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Python tag.UnigramTagger类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中nltk.tag.UnigramTagger的典型用法代码示例。如果您正苦于以下问题：Python UnigramTagger类的具体用法？Python UnigramTagger怎么用？Python UnigramTagger使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了UnigramTagger类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: pos_tag

def pos_tag(pos_type, tokenized_sent):
    """Tag a tokenized sentence with the requested tagger.

    Args:
        pos_type: ``'unigram'`` trains a ``UnigramTagger`` on the pickled
            training data in ``res/brown_train.pkl``; ``'max_pos'``
            delegates to ``nltk.pos_tag``.
        tokenized_sent: The tokens to tag.

    Returns:
        The selected tagger's output for ``tokenized_sent``, or ``None``
        when ``pos_type`` is neither of the two recognized values.
    """
    if pos_type == 'unigram':
        # Close the training file deterministically instead of leaking the
        # handle. NOTE: pickle.load can execute arbitrary code, so the file
        # must come from a trusted source.
        with open('res/brown_train.pkl', 'rb') as train_file:
            brown_train = pickle.load(train_file)
        return UnigramTagger(brown_train).tag(tokenized_sent)
    if pos_type == 'max_pos':
        return nltk.pos_tag(tokenized_sent)
    # Unknown tagger type: keep the historical "return nothing" behavior,
    # but make it explicit.
    return None
开发者ID:merkhofer,项目名称:parsel,代码行数:7,代码来源:nltk_magic.py


示例2: tag_unigrams_by_topic

 def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
     """Tag every topic's sentences with a unigram tagger trained on mac_morpho.

     Args:
         dict_of_sentences_by_topic: Mapping from a topic key to that
             topic's sentences, in the form accepted by the tagger's
             batch tagging method.

     Returns:
         A dict with the same keys whose values are the tagged sentences.
     """
     train_sents = mac_morpho.tagged_sents()[:5000]
     tagger = UnigramTagger(train_sents)
     # NLTK 3 renamed ``batch_tag`` to ``tag_sents``; prefer the new name and
     # fall back to the old one so older installations keep working.
     tag_many = getattr(tagger, 'tag_sents', None) or tagger.batch_tag
     return {
         topic: tag_many(sentences)
         for topic, sentences in dict_of_sentences_by_topic.items()
     }
开发者ID:EduardoCarvalho,项目名称:nltkPhraseDetector,代码行数:7,代码来源:extractPhrases.py


示例3: trainUniTnT

 def trainUniTnT(self):
     """Train a TnT tagger and a UnigramTagger separately, without a DefaultTagger.

     Runs ``self.folds`` rounds over ``self.foldlist``. Each round trains both
     taggers on the first ``self.folds - 1`` folds, tags the untagged fold at
     index ``self.folds - 1`` and appends the output to ``self.tnt_tagged`` /
     ``self.uni_tagged`` and the gold sentences to ``self.org_tagged``. The
     fold list is then rotated so a different fold is held out next round.

     Afterwards the taggers of the final round are stored on ``self.tnt`` and
     ``self.unigram``, the overall accuracies are stored and printed, the
     results of ``self.tagprecision_recall`` are stored, and
     ``self.org_tagged`` / ``self.foldlist`` are reset.
     """
     # presumably populates self.foldlist with self.folds folds -- confirm
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         # Training data: every fold except the one at index folds - 1,
         # flattened into a single list of sentences.
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         tnt_tagger = tnt.TnT(N=100)
         tnt_tagger.train(train_sents)
         # NOTE(review): the message says "evaluated" but only training has
         # happened at this point.
         print(str(k) + " fold: tnt evaluated")
         unigram = UnigramTagger(train_sents)
         print(str(k) + " fold: unigram evaluated")
         # Held-out fold with its tags stripped, ready to be re-tagged.
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
         self.uni_tagged += unigram.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         # Rotate: move the held-out fold to the front so the next round
         # holds out a different fold.
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     # Keep the taggers trained in the final round.
     self.tnt = tnt_tagger
     self.unigram = unigram
     # Accuracy over all rounds, computed on the flattened token lists.
     self.tnt_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.tnt_tagged, []))
     self.uni_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.uni_tagged, []))
     print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
     print("Accuracy of concatenated unigram-tagged sentences: ", self.uni_avg_acc)
     (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
         tnt_tagger, self.tnt_tagged, self.org_tagged
     )
     (self.unigram_tagprecision, self.unigram_tagrecall) = self.tagprecision_recall(
         unigram, self.uni_tagged, self.org_tagged
     )
     # Reset the following values so that trainRegexp starts from the
     # initial values.
     # NOTE(review): self.tnt_tagged and self.uni_tagged are not reset here --
     # confirm that is intended.
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
开发者ID:Batene,项目名称:Bamanankan,代码行数:32,代码来源:CrossValidation.py


示例4: tag_words

	def tag_words(self, words, sents):
		"""Train a unigram tagger on ``treebank`` and print its score on ``sents[0]``.

		Args:
			words: Unused; kept so existing callers keep working.
			sents: Sequence of tokenized sentences; only the first is tagged.
		"""
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		tagged_sent = tagger.tag(sents[0])
		# evaluate() expects a list of tagged sentences, so wrap the single
		# tagged sentence instead of passing its (word, tag) pairs directly.
		# NOTE(review): the "gold" tags here are the tagger's own output, so
		# this is a self-consistency check rather than a real accuracy figure.
		print(tagger.evaluate([tagged_sent]))
开发者ID:jayvachon,项目名称:managerisk-reflection-search,代码行数:10,代码来源:sentiment-analysis.py


示例5: baseline

def baseline(tagged_sentences):
    """Train and score a unigram baseline tagger on the given corpus.

    Tokens are lowercased and pairs whose tag is not in ``_UNI`` are dropped.
    The trailing ``int(30%)`` of the sentences (in input order) is held out
    for evaluation; the rest trains a ``UnigramTagger`` (``cutoff=5``) that
    backs off to a ``DefaultTagger`` using the most frequent training tag.
    Corpus statistics and the final score are printed; nothing is returned.
    """
    from collections import Counter
    from nltk.tag import DefaultTagger
    from nltk.tag import UnigramTagger

    def _normalise(sentence):
        # Lowercase every token first, then keep only the pairs whose tag is
        # in _UNI, for a proper comparison with the other methods.
        lowered = [(pair[0].lower(), pair[1]) for pair in sentence]
        return [pair for pair in lowered if pair[1] in _UNI]

    tagged_sentences = [_normalise(sentence) for sentence in tagged_sentences]

    # Corpus size in documents and tokens.
    n_docs = len(tagged_sentences)
    corpus_size = sum(len(sentence) for sentence in tagged_sentences)
    print('Corpus size: {} docs'.format(n_docs))
    print('Corpus size: {} tokens'.format(corpus_size))

    # Train/test split: the last part of the corpus becomes the test set.
    test_pct = 0.3
    split_at = n_docs - int(n_docs * test_pct)
    train_set = tagged_sentences[:split_at]
    test_set = tagged_sentences[split_at:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # Test set size in tokens.
    test_size = sum(len(sentence) for sentence in test_set)
    print('Test set: {} tokens'.format(test_size))

    # Most common tag in the training set (expected to be 'NOUN').
    tag_counts = Counter(
        pair[1] for sentence in train_set for pair in sentence
    )
    most_common = tag_counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Build the model: unigram lookup with the most common tag as fallback.
    tagger = UnigramTagger(
        train=train_set,
        backoff=DefaultTagger(most_common),
        cutoff=5,
    )

    # Evaluate on the held-out sentences.
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
开发者ID:lrei,项目名称:xlime_twitter_corpus,代码行数:50,代码来源:experiment.py


示例6: getUnigramTaggerAccuracy

def getUnigramTaggerAccuracy(trainingSet, testingSet):
    """Train a UnigramTagger on ``trainingSet`` and score it on ``testingSet``.

    The words of each test sentence are re-tagged by the trained tagger and
    the predicted tags are compared with the gold tags via
    ``calculateAccuracy``, whose result is returned.
    """
    def column(sentences, index):
        # Pick one field (0 = word, 1 = tag) of every pair, sentence by sentence.
        return [[pair[index] for pair in sentence] for sentence in sentences]

    # Split the test set into untagged sentences and gold POS tags.
    words = column(testingSet, 0)
    gold_tags = column(testingSet, 1)

    # Train, then tag the untagged test sentences.
    tagger = UnigramTagger(trainingSet)
    predicted_tags = column(tagger.tag_sents(words), 1)

    return calculateAccuracy(gold_tags, predicted_tags)
开发者ID:kyajmiller,项目名称:LING-539,代码行数:16,代码来源:q2.py


示例7: tag_penn

def tag_penn(words):
    """
    Tags words with a UnigramTagger trained on the ``treebank`` tagged sentences.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    The (word, tag) tuples produced by the tagger's ``tag`` method.
    """

    training_sentences = treebank.tagged_sents()
    return UnigramTagger(training_sentences).tag(words)
开发者ID:nwngeek212,项目名称:NaturalLanguageProcessing,代码行数:17,代码来源:helper.py


示例8: contextual_rules

def contextual_rules(wikicorpus_dir, context_file):
    """Learn Brill contextual rules from the wiki corpus and write them to a file.

    Sentences are read with ``wikicorpus(wikicorpus_dir, words=1000000)``,
    every token tagged ``"NP"`` is replaced by a single placeholder token, a
    Brill tagger is trained on top of a UnigramTagger (at most 100 rules),
    and each learned rule is written on its own line to ``context_file``.
    """
    ANONYMOUS = "anonymous"
    corpus = wikicorpus(wikicorpus_dir, words=1000000)

    # Anonymise proper nouns in place (NP = proper noun in the Parole tagset).
    for sentence in corpus:
        for position, (_, tag) in enumerate(sentence):
            if tag == "NP":
                sentence[position] = (ANONYMOUS, "NP")

    templates = fntbl37()

    base_tagger = UnigramTagger(corpus)
    trainer = BrillTaggerTrainer(base_tagger, templates, trace=0)
    brill_tagger = trainer.train(corpus, max_rules=100)

    with open(context_file, "w") as out:
        out.writelines("%s\n" % rule for rule in brill_tagger.rules())
开发者ID:jgsogo,项目名称:lingwars,代码行数:20,代码来源:pattern_wikicorpus.py


示例9: make_pos_model

def make_pos_model(model_type):
    """Train a POS tagger on ``greek_training_set.pos`` and pickle it.

    Args:
        model_type: One of ``'unigram'``, ``'bigram'``, ``'trigram'``,
            ``'backoff'`` (trigram backing off to bigram, then unigram)
            or ``'tnt'``.

    Returns:
        None. The pickled tagger is written under
        ``~/greek_models_cltk/taggers/pos`` and a summary line is printed.
        For an unrecognized ``model_type`` a message is printed and nothing
        is trained or written.
    """
    model_files = {
        'unigram': 'unigram.pickle',
        'bigram': 'bigram.pickle',
        'trigram': 'trigram.pickle',
        'backoff': '123grambackoff.pickle',
        'tnt': 'tnt.pickle',
    }
    if model_type not in model_files:
        # Stop before any work is done; falling through would fail later on
        # the tagger and file name that were never assigned.
        print('Invalid model_type.')
        return

    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    else:  # 'tnt' -- the only remaining validated value
        tagger = tnt.TnT()
        tagger.train(train_sents)

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    # Create the output directory on first use so the dump below cannot fail
    # just because the folder is missing.
    if not os.path.isdir(_dir):
        os.makedirs(_dir)
    path = os.path.join(_dir, model_files[model_type])
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
开发者ID:wencanluo,项目名称:greek_treebank_perseus,代码行数:32,代码来源:make_pos_models.py


示例10: tag_linked

def tag_linked(words, default_tag='INFO'):
    """
    Tags words with a UnigramTagger trained on the ``treebank`` tagged sentences.
    Uses DefaultTagger to assign "default_tag" to any word the unigram tagger cannot tag.

    Parameters
    ----------
    words: A list of strings.
    default_tag: Tag given to words the unigram tagger has no tag for.

    Returns
    -------
    The (word, tag) tuples produced by the tagger's ``tag`` method.
    """

    default_tagger = DefaultTagger(default_tag)
    # Chain the taggers through the public ``backoff`` argument rather than
    # overwriting the private ``_taggers`` list after construction.
    pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)

    return pt_tagger.tag(words)
开发者ID:nwngeek212,项目名称:NaturalLanguageProcessing,代码行数:23,代码来源:helper.py


示例11: PyTenseShift

class PyTenseShift(object):
    """Base class for translating sentences from present to past tense.

    When you use the PlPyTenseShift you may implement your own tagger to
    optimize the translation results: implement the tagger interface and
    change the tagger construction in ``__init__``. Subclasses must
    implement ``getPastTense``.
    """

    def __init__(self, corpus, isPl):
        """Create the tagger later used by ``_tokenize``.

        Args:
            corpus: Passed to ``FirstTagger`` when ``isPl`` is true;
                otherwise its ``tagged_sents()`` train a ``UnigramTagger``
                that backs off to ``DefaultTagger("NN")``.
            isPl: Selects the ``FirstTagger`` branch when true
                (presumably "is Polish" -- confirm against callers).
        """
        # Only the attribute for the selected branch is created, so
        # ``_tokenize`` must be called with the same ``isPl`` value.
        if isPl:
            self.tagger = FirstTagger(corpus)
        else:
            dtag = DefaultTagger("NN")
            self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)

    def _tokenize(self, tense, isPl):
        """Tag the input sentence word by word.

        This kind of representation is better to evaluate.

        Args:
            tense: The sentence to process.
            isPl: When true the sentence is handed to ``self.tagger`` as is;
                otherwise it is split with ``tokenize`` and tagged by the
                unigram tagger.

        Returns:
            The output of the selected tagger's ``tag`` method.
        """
        if isPl:
            return self.tagger.tag(tense)
        else:
            return self.__utag.tag(tokenize(tense))

    def getPastTense(self, tense):
        """Translate a sentence given in present tense into past tense.

        Args:
            tense (str): Sentence to translate.
        Returns:
            str. Sentence in past tense.
        Raises:
            NotImplementedError: Always; subclasses provide the implementation.
        """
        raise NotImplementedError("abstract method")
开发者ID:perfidia,项目名称:pytenseshift,代码行数:36,代码来源:__init__.py


示例12: write_word_list

from nltk.corpus import brown
from nltk.tag import UnigramTagger
import cPickle as pickle

# Path of the pickled word collection loaded when this file runs as a script.
INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"

def write_word_list(filename, word_list):
    """Write ``word_list`` to ``filename``, one word per line.

    The words are joined with newlines and followed by a single trailing
    newline, which is exactly what printing the joined string to the file
    produces.
    """
    # ``with`` guarantees the handle is flushed and closed, and a plain
    # write() behaves the same on Python 2 and Python 3 (``print >>`` is
    # Python 2 only).
    with open(filename, "w") as out_fp:
        out_fp.write("\n".join(word_list) + "\n")

if __name__ == '__main__':
    # Load the candidate words. NOTE: pickle.load can execute arbitrary code,
    # so INPUT_FILE must come from a trusted source.
    with open(INPUT_FILE, "rb") as in_fp:
        words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        # Tag each word on its own and keep just the tag.
        tag = tagger.tag([word])[0][1]
        # Drop words the tagger has no tag for, and words whose tag contains
        # "NP" (the proper-noun tags of the Brown tagset).
        if tag is None or "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
开发者ID:viveksck,项目名称:langchange,代码行数:23,代码来源:brown_words.py


示例13:

import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
# Train a unigram tagger on (at most) the first 7000 tagged treebank sentences.
training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
# Show the first raw sentence, then the same sentence as tagged by the model.
first_sentence = treebank.sents()[0]
print(first_sentence)
print(unitagger.tag(first_sentence))
开发者ID:PacktPublishing,项目名称:Mastering-Natural-Language-Processing-with-Python,代码行数:7,代码来源:ch4_16.py


示例14:

import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank
# NOTE(review): the evaluation slice [2000:] overlaps the training slice
# [:7000], so the score is measured on sentences seen in training --
# confirm this is intended.
tagged_sents = treebank.tagged_sents()
testing = tagged_sents[2000:]
training = tagged_sents[:7000]
# Unigram tagger that falls back to 'NN' for words it has no tag for.
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
开发者ID:xenron,项目名称:sandbox-da-python,代码行数:9,代码来源:ch4_19.py


示例15: UnigramTagger

import nltk
import json

from nltk.corpus import brown
from nltk.tag import UnigramTagger
# Unigram tagger trained on the Brown corpus with the universal tagset.
tagger = UnigramTagger(brown.tagged_sents(tagset='universal'))
sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
for word, tag in tagger.tag(sent):
    if tag == "VERB":
        print(word, '->', tag)


# Write every word tagged as a verb in the questions file, one per line.
# Both files are managed by ``with`` so the output is flushed and closed even
# if tagging fails part-way through.
with open("../assets/inputText/verbs_tagged_questions.txt", 'w+') as verbs_tagged, \
        open("../assets/inputText/all_questions.txt", 'r') as all_lines:
    for line in all_lines:
        # split() without arguments also strips the trailing newline, so the
        # last word of each line is looked up as the word itself rather than
        # as "word\n" (which the tagger could never match).
        splitLine = line.split()
        for word, tag in tagger.tag(splitLine):
            if tag == "VERB":
                verbs_tagged.write(word + "\n")
				



开发者ID:diana-wang,项目名称:NLP_Research,代码行数:20,代码来源:unigramTagging.py


示例16: UnigramTagger

#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division
from nltk.tag import UnigramTagger


if __name__ == '__main__':
    # Word -> tag lookup table handed to the tagger instead of training data.
    model = {
        u'Péter': 'N',
        'Enikő': 'N',
        'szeret': 'V',
        'Marit': 'Nacc',
    }
    sentence = ['Péter', 'Enikő', 'szeret', 'Marit']
    tagger = UnigramTagger(model=model)

    print(tagger.tag(sentence))


开发者ID:davidpgero,项目名称:hungarian-nltk,代码行数:11,代码来源:unigramm_tagger.py


示例17: UnigramTagger

import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger
# Train on the first 700 tagged sentences of the Brown 'news' category.
news_sents = brown.tagged_sents(categories='news')
tagger = UnigramTagger(news_sents[:700])
sentence = ['John','and','Smith','went','to','NY','and','Germany']
# Print every token next to the tag the model assigned to it.
tagged_sentence = tagger.tag(sentence)
for word, tag in tagged_sentence:
    print(word,'->',tag)
开发者ID:xenron,项目名称:sandbox-da-python,代码行数:7,代码来源:ch6_12.py


示例18: cltk_pos_cv

def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate trigram tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
#.........这里部分代码省略.........
开发者ID:wencanluo,项目名称:cltk_pos,代码行数:101,代码来源:pos_cltk_cv.py


示例19:

import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
# NOTE(review): the evaluation slice [2000:] overlaps the training slice
# [:7000], so the score is measured on sentences seen in training --
# confirm this is intended.
tagged_sents = treebank.tagged_sents()
training = tagged_sents[:7000]
unitagger = UnigramTagger(training)
testing = tagged_sents[2000:]
print(unitagger.evaluate(testing))
开发者ID:xenron,项目名称:sandbox-da-python,代码行数:7,代码来源:ch4_17.py


示例20: print

# Tagged sentences of the Brown 'news' category, e.g.
# [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')], ...]
brown_tagged_sents = brown.tagged_sents(categories='news')

# Baseline: tag every token as 'NN'.
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))
# 0.13089484257215028

# Tiny hand-made sample: one of its three tokens is really an 'NN'.
brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]]
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333

# 90% / 10% train/test split; the boundary index is computed once.
split_at = int(len(brown_tagged_sents) * 0.9)
train_data = brown_tagged_sents[:split_at]
test_data = brown_tagged_sents[split_at:]

# Each n-gram tagger backs off to the next simpler one.
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
unigram_score = unigram_tagger.evaluate(test_data)
print(unigram_score)
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
bigram_score = bigram_tagger.evaluate(test_data)
print(bigram_score)
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
trigram_score = trigram_tagger.evaluate(test_data)
print(trigram_score)
# 0.8427190272102063

regexp_tagger = RegexpTagger(
    [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
    ( r'(The|the|A|a|An|an)$', 'AT'), # articles
    ( r'.*able$', 'JJ'), # adjectives
开发者ID:jzm17173,项目名称:Learn,代码行数:30,代码来源:词性标注器.py



注:本文中的nltk.tag.UnigramTagger类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python stanford.NERTagger类代码示例发布时间:2022-05-27
下一篇:
Python tag.StanfordNERTagger类代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap