
Python data.find Function Code Examples


This article collects typical usage examples of the nltk.data.find function in Python. If you are wondering what the find function does, how to call it, or what it looks like in real code, the curated examples below should help.



The following presents 20 code examples of the find function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
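Before the examples, here is a minimal sketch of the basic call pattern (assuming NLTK is installed; the resource name below is only an illustration): nltk.data.find searches the configured nltk_data directories for a named resource and returns a path pointer to it, raising LookupError if the resource has not been downloaded.

from nltk.data import find

try:
    # 'tokenizers/punkt' is an illustrative resource name
    path = find('tokenizers/punkt')
    print('Resource found at:', path)
except LookupError:
    # Not installed yet; it could be fetched with nltk.download('punkt')
    print('Resource not found')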

Example 1: test_corpus_bleu

    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Read the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are located on the second-to-last line of the file.
            # The first and second items in the list are the score and system names.
            # Materialize the map into a list so it can be iterated over in both loops below.
            mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace-tokenize the files.
                # Note: split() with no arguments also discards surrounding whitespace.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                    # Check that the difference between the BLEU scores is less than 0.005.
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0/i,)*i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
Developer: DrDub, Project: nltk, Lines of code: 35, Source file: test_bleu.py
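Example 1 compares NLTK's corpus_bleu against the mteval-13a reference implementation. As a quick orientation, here is a tiny self-contained sketch of the corpus_bleu call pattern exercised above, using toy sentences rather than the WMT15 data:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

hypothesis = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
# corpus_bleu expects a list of lists of references, one inner list per hypothesis.
references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
score = corpus_bleu(references, hypothesis, weights=(0.5, 0.5),
                    smoothing_function=SmoothingFunction().method3)
print(score)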


Example 2: demo

def demo():
    from itertools import islice

#    zip_path = find('corpora/toolbox.zip')
#    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
#    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
Developer: esabelhaus, Project: secret-octo-dubstep, Lines of code: 27, Source file: toolbox.py


Example 3: build_model

def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)

    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)

    return cp
Developer: huderlem, Project: nltk, Lines of code: 35, Source file: named_entity.py


Example 4: nltk_download_corpus

def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
Developer: jianjun66, Project: ChatterBot, Lines of code: 32, Source file: utils.py
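The trailing-slash comment in the snippet above is the key detail: find() only locates a directory stored inside a zipfile when the resource name ends with '/'. A brief usage sketch of the helper follows (the import path is hypothetical, since only the source file name utils.py is given above):

# hypothetical import path for the helper shown above
from utils import nltk_download_corpus

# Returns True when the corpus had to be downloaded, False if find() already located it.
needed_download = nltk_download_corpus('sentiment/vader_lexicon')
print('downloaded this run:', needed_download)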


Example 5: build_model

def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Developer: chatbotimporved, Project: chatbot, Lines of code: 32, Source file: informationextraction.py


Example 6: __init__

    def __init__(self):
        from nltk.data import find
        from nltk import download

        try:
            find('wordnet.zip')
        except LookupError:
            download('wordnet')
Developer: fmoliveira, Project: ChatterBot, Lines of code: 8, Source file: word_net.py


Example 7: namedEntityRecognizer

def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    #print content

    if Verbose:
        echo2("Incoming content is "+content)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag
    start = time.time()
    #date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    #names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged),'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
Developer: anirbanmishra, Project: Content_Evaluation, Lines of code: 31, Source file: NLTKRestServer.py


Example 8: _vocabulary

    def _vocabulary(self):
        return (
            data.find('stemmers/porter_test/porter_vocabulary.txt')
            .open(encoding='utf-8')
            .read()
            .splitlines()
        )
Developer: DrDub, Project: nltk, Lines of code: 7, Source file: test_stem.py


Example 9: _vocabulary

    def _vocabulary(self):
        with closing(
            data.find('stemmers/porter_test/porter_vocabulary.txt').open(
                encoding='utf-8'
            )
        ) as fp:
            return fp.read().splitlines()
Developer: rmalouf, Project: nltk, Lines of code: 7, Source file: test_stem.py


Example 10: demo

def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key,value) in childes.corpus(file)[0].items():
            if key == "Corpus": corpus = value
            if key == "Id": corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')
        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
Developer: johndpope, Project: jazzparser, Lines of code: 30, Source file: childes.py


Example 11: test_vocabulary_nltk_mode

    def test_vocabulary_nltk_mode(self):
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            data.find('stemmers/porter_test/porter_nltk_output.txt')
            .open(encoding='utf-8')
            .read()
            .splitlines()
        )
Developer: DrDub, Project: nltk, Lines of code: 8, Source file: test_stem.py


Example 12: _get_tagger

def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
Developer: Weiming-Hu, Project: text-based-six-degree, Lines of code: 8, Source file: __init__.py


Example 13: __init__

    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the wordnet data only if it is not already downloaded
        wordnet_path = None
        if os.name == 'nt':
            wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        else:
            wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        try:
            if not os.path.isfile(wordnet_path):
                find('wordnet.zip')
        except LookupError:
            download('wordnet')
Developer: AugustoQueiroz, Project: ChatterBot, Lines of code: 18, Source file: wordnet.py


Example 14: demo

def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
Developer: rmalouf, Project: nltk, Lines of code: 57, Source file: childes.py
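The demo above also prints instructions for running it against a manually downloaded copy of the CHILDES corpus. A minimal sketch of that direct call pattern (the path is a placeholder for wherever the CHILDES XML data was unpacked):

from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = '/path/to/childes/data-xml/Eng-USA/'  # placeholder path
childes = CHILDESCorpusReader(corpus_root, '.*.xml')
print(childes.fileids()[:5])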


Example 15: __init__

    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the punkt data only if it is not already downloaded
        punkt_path = None
        if os.name == 'nt':
            punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                                'tokenizers', 'punkt.zip')
        else:
            punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                                'tokenizers', 'punkt.zip')
        try:
            if not os.path.isfile(punkt_path):
                find('punkt.zip')
        except LookupError:
            download('punkt')
Developer: AugustoQueiroz, Project: ChatterBot, Lines of code: 18, Source file: tokenizer.py


Example 16: __init__

    def __init__(self, load=True):
        '''
        :param load: Load the pickled model upon instantiation.
        '''
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
            self.load(AP_MODEL_LOC)
Developer: GINK03, Project: KindleReferencedIndexScore, Lines of code: 10, Source file: perceptron.py


Example 17: _get_tagger

def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == "eng":
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger
Developer: DrDub, Project: nltk, Lines of code: 10, Source file: __init__.py


Example 18: test_vocabulary_original_mode

    def test_vocabulary_original_mode(self):
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # http://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.

        with closing(data.find('stemmers/porter_test/porter_original_output.txt').open(encoding='utf-8')) as fp:
            self._test_against_expected_output(
                PorterStemmer.ORIGINAL_ALGORITHM,
                fp.read().splitlines()
            )
Developer: alpaco42, Project: ML_Spring_2018, Lines of code: 20, Source file: test_stem.py


Example 19: nltk_download_corpus

def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split
    from zipfile import BadZipfile

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    # From http://www.nltk.org/api/nltk.html
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    #
    # Helps when resource_path == 'sentiment/vader_lexicon'
    # (find() expects '/' regardless of the operating system's path separator)
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.'
        )

    return downloaded
Developer: dawnpower, Project: ChatterBot, Lines of code: 38, Source file: utils.py


Example 20: _str2records

def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """ 
    recs = []
    path = find("corpora/chat80/%s" % filename)
    for line in path.open():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            line = line[:-1]
            record = line.split(',')
            recs.append(record)
    return recs
Developer: approximatelylinear, Project: nltk, Lines of code: 14, Source file: chat80.py
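The helper above strips the relation name and the trailing ').' from each clause and splits the rest on commas. A small self-contained sketch of that transformation, using a made-up clause in the same rel(arg1,arg2,...). syntax rather than an actual chat80 file:

import re

rel = 'city'
line = 'city(athens,greece,1368).\n'  # hypothetical clause in chat80-style syntax
line = re.sub(rel + r'\(', '', line)  # drop the leading 'city('
line = re.sub(r'\)\.$', '', line)     # drop the trailing ').'
line = line[:-1]                      # drop the newline
print(line.split(','))                # ['athens', 'greece', '1368']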



Note: The nltk.data.find examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and redistribution or use of the code should follow the license of the corresponding project. Do not reproduce this article without permission.

