Python nltk.ne_chunk_sents Function Code Examples


This article collects typical usage examples of Python's nltk.ne_chunk_sents function. If you are unsure what ne_chunk_sents does, how to call it, or what its arguments mean, the curated examples below should help.



Twenty code examples of the ne_chunk_sents function are shown below, ordered by popularity by default. Rating the examples you find useful helps surface better Python code samples.
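
Before the individual examples, here is a minimal, self-contained sketch of the pipeline most of them share: sentence tokenization, word tokenization, POS tagging, then nltk.ne_chunk_sents. The sample text and the extract_entity_names helper are illustrative assumptions for this sketch only, not taken from any single project below; the NLTK data packages listed in the comments are typically required.

import nltk

# Assumed one-time downloads (hypothetical setup for this sketch):
#   nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
#   nltk.download('maxent_ne_chunker'); nltk.download('words')

def extract_entity_names(tree):
    """Recursively collect the tokens under every 'NE' subtree."""
    names = []
    if hasattr(tree, 'label') and tree.label() == 'NE':
        names.append(' '.join(child[0] for child in tree))
    else:
        for child in tree:
            if hasattr(child, 'label'):          # skip plain (word, tag) leaves
                names.extend(extract_entity_names(child))
    return names

text = "Barack Obama met Angela Merkel in Paris."   # illustrative sample text
sentences = nltk.sent_tokenize(text)
tokenized = [nltk.word_tokenize(s) for s in sentences]
tagged = [nltk.pos_tag(s) for s in tokenized]
# binary=True marks every entity simply as 'NE' instead of PERSON/GPE/ORGANIZATION
chunked = nltk.ne_chunk_sents(tagged, binary=True)

entities = set()
for tree in chunked:                              # iterate over the chunked sentence trees
    entities.update(extract_entity_names(tree))
print(entities)

With binary=False the trees instead carry type-specific labels such as PERSON, ORGANIZATION, and GPE, which is the pattern Example 5 below relies on.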

Example 1: parts_of_speech

 def parts_of_speech(self, corpus):
     "Returns named entity chunks in a given text."
     sentences = nltk.sent_tokenize(corpus)  # using the sentence tokenizer on Spanish text
     tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
     pos_tags  = [nltk.pos_tag(sentence) for sentence in tokenized]
     chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
     return chunked_sents
Developer: IIC2113-Grupo3-2015, Project: Procesador-de-Textos, Lines of code: 7, Source: GeneradorRelaciones.py


Example 2: chunkIntoEntities

def chunkIntoEntities( text ):
    entities = []
    sentences = sentenceTokenization(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    
    def extract_entity_names(t):
        entity_names = []
    
        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
    
        return entity_names
    
    for idx,tree in enumerate(chunked_sentences):
        entity_names = extract_entity_names(tree)
        entities.extend(entity_names)
    
    chunked_content = splitContentbyDelimiter(text, entities)
    return [chunked_content, entities]
Developer: dxr1988, Project: NLTK-Research, Lines of code: 25, Source: nltk_helper.py


Example 3: getEntities

def getEntities(filename):
    with open(filename, 'r') as f:  # use the filename argument instead of a hard-coded path
        sample = f.read()
    sample = sample.decode('unicode_escape').encode('ascii', 'ignore')
    print "sentence tokenize..."
    sentences = nltk.sent_tokenize(sample)
    print len(sentences)
    # keep only the first ~1/30 of the sentences to speed things up
    sentences = sentences[:len(sentences)/30]
    print len(sentences)
    print "word tokenize..."
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    print "POS tagging..."
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    print "Chunking..."
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    print "getting entities..."
    print "total sentences = ", len(tagged_sentences)  # chunked_sentences is lazy, so count the tagged ones
    for i, tree in enumerate(chunked_sentences):
        if i % 100 == 0:
            print "on sentence", i
        entity_names.extend(extract_entity_names(tree))
    uniques = list(set(entity_names))
    # only return named entities that are 2 words or more
    output = [u for u in uniques if len(u.split(" ")) >= 2]
    return output
Developer: wellesleynlp, Project: meganokeefe-finalproject, Lines of code: 25, Source: entities.py


Example 4: extract_entity_names

def extract_entity_names(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def entity_names(t):
        names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    names.extend(entity_names(child))

        return names

    names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print extract_entity_names(tree)

        names.extend(entity_names(tree))

    return set(names)
Developer: michal3141, Project: geomedia, Lines of code: 26, Source: ner_extract.py


Example 5: nltk_extract_ner

def nltk_extract_ner(text):
    """
    Use of NLTK NE
    :param text:
    :return: list of all extracted NE
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)

    d = defaultdict(list)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            # if the subtree is a recognized NE type, store it under that type's key
            if t.label() in ne_types:
                d[t.label()].append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    for tree in chunked_sentences:
        # Get results per sentence
        extract_entity_names(tree)


    # return all entity names
    return d
Developer: bfurlan, Project: IE4MAS, Lines of code: 33, Source: nltk_ner_extractor.py


Example 6: get_entities

def get_entities(story):
	entities = {}

	'''Earlier (incorrect) approach: before calling nltk.pos_tag(), the story
		must first be split into sentences with nltk.sent_tokenize(), and each
		sentence must then be tokenized into words with nltk.word_tokenize().
	storytokens = tokenizer(story) #remove '\'', ',' and '.'
	pos_words = nltk.pos_tag(storytokens)
	'''

	sentences = nltk.sent_tokenize(story)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	#label 'Boy' and 'Scout' as 'NNP' respectively 
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
	#label 'Boy Scout' as 'NE'(entity)
	chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

	# collect the entity names found in every chunked sentence
	entity_in_sentences = []
	for tree in chunked_sentences:
		#extract_entity_names(tree) finds the entities in each chunked sentence
		entity_in_sentences.extend(extract_entity_names(tree))

	#remove duplicate entities across all chunked sentences
	entities_unique = set(entity_in_sentences)
	#build the entities dict, mapping each entity name to an index
	i = 0
	for entity in entities_unique:
		entities[entity] = i
		i += 1

	return entities
Developer: YuzhouWang, Project: 657-Project, Lines of code: 32, Source: preprocess_data.py


Example 7: extractKeywords

def extractKeywords(data):
    array = []
    logging.warning('NLTK processing starts:')
    logging.warning(data)
    for i, item in enumerate(data):
        sample = data[i]
        sentences = nltk.sent_tokenize(sample)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        def extract_entity_names(t):
            entity_names = []
            if hasattr(t, 'label') and t.label:
                if t.label() == 'NE':
                    entity_names.append(' '.join([child[0].lower() for child in t]))
                else:
                    for child in t:
                        entity_names.extend(extract_entity_names(child))
            return entity_names

        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
        for item in entity_names:
            if item not in stops:
                array.append(item)
    logging.warning('NLTK processing finished:')
    logging.warning(array)
    return array
Developer: KseniiaBelorustceva, Project: text-analyser, Lines of code: 30, Source: app.py


Example 8: extract_named_entities

def extract_named_entities(text_blocks):
    """
    Return a list of named entities extracted from provided text blocks (list of text strings).
    """
    sentences = []
    for text in text_blocks:
        sentences.extend(nltk.sent_tokenize(text))

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label'):
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    return set(entity_names)
Developer: hasgeek, Project: coaster, Lines of code: 29, Source: nlp.py


Example 9: get_top_NEs

def get_top_NEs(tagged_sentences, n=TOP_NERs):
    """ Return the n longest named entities of a text """
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    return sorted(entity_names, key=len, reverse=True)[:n]
Developer: pan-webis-de, Project: maluleka16, Lines of code: 8, Source: source-retrieval.py


Example 10: chunk_sentences

def chunk_sentences(sentences):

    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

    chunked_sentences = nltk.ne_chunk_sents(sentences, binary=True)

    return chunked_sentences
Developer: Jwpe, Project: entity-extractor, Lines of code: 8, Source: extract_named_entities.py


Example 11: ie_process

def ie_process(document):
    "Returns named entity chunks in a given text."
    sentences = nltk.sent_tokenize(document)
    # build a translation table that strips punctuation (Python 3 str API)
    table = str.maketrans('', '', string.punctuation)
    tokenized = [nltk.word_tokenize(sentence.translate(table)) for sentence in sentences]
    pos_tags  = [nltk.pos_tag(sentence) for sentence in tokenized]
    #print(pos_tags)
    chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
    return chunked_sents
Developer: vipmunot, Project: Sentiment-Analysis, Lines of code: 8, Source: NLP+processing+and+Named+Entity+_+Relationship+Extraction.py


Example 12: extract_person_names

def extract_person_names(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [pos_tagger.tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

    return set(_flat_map(extract_person_names_from_tree(tree)
                         for tree in chunked_sentences))
Developer: csojinb, Project: name-extractor-api, Lines of code: 8, Source: name_extractor.py


Example 13: extract_named_entities

def extract_named_entities(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return list(set(entity_names))
Developer: dibaunaumh, Project: fcs-skateboard, Lines of code: 9, Source: extract_article_concepts.py


Example 14: chunked_sentences

def chunked_sentences(text):
    """Splits a large string into chunked sentences [http://www.nltk.org/book/ch07.html#chunking]
    """
    import nltk
    sentences = split_sentences(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    return chunked_sentences
Developer: makalaaneesh, Project: newspaper, Lines of code: 9, Source: nlp.py


Example 15: name_rec1

def name_rec1(sample):
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
Developer: Sapphirine, Project: Data-Analytics-of-Video-Popularity, Lines of code: 9, Source: NE.py


Example 16: analyse_hansard_file

def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)

    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Prevent the sentence tokenizer from breaking at the abbreviation "No."
    sent_tokenizer._params.abbrev_types.add('no')
    #sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)

    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)

    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k:len(v) for k,v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)

    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc, ...)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
Developer: hsenot, Project: parliament_of_australia, Lines of code: 55, Source: utils.py


Example 17: get_ner_nltk

 def get_ner_nltk(self, text):
   sents = nltk.sent_tokenize(text)  # sentences
   tokenized_sents = [nltk.word_tokenize(s) for s in sents]
   tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
   chunked_sents = [x for x in nltk.ne_chunk_sents(tagged_sents)]
   raw = self.traverseTree(chunked_sents)
   ners = {}
   for n in self.entity_cols: ners[n] = []
   for k, v in raw: ners[k].append(v.lower())
   for n in self.entity_cols: ners[n] = list(set(ners[n]))
   return ners
Developer: Marsan-Ma, Project: tnative, Lines of code: 11, Source: ner.py


Example 18: nominated_entities

 def nominated_entities(self):
     
     sentences = nltk.sent_tokenize(self.article)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
     
     entity_names = []
     for chunked_sentence in chunked_sentences:
         entity_names.extend(self._extract_entity_names(chunked_sentence))
     
     return list(set(entity_names))
Developer: annanda, Project: nltk, Lines of code: 12, Source: NEExtractor.py


Example 19: get_entities3

def get_entities3(text):
  sentences = nltk.sent_tokenize(text)
  tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
  tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
  #chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
  chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
  
  entity_names=[]
  for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

  return filter_entities(entity_names)
Developer: bstewartny, Project: pnews, Lines of code: 12, Source: feeds.py


Example 20: initialize

 def initialize(self, sample):
     sentences = nltk.sent_tokenize(sample)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
     entity_names = []
     for tree in chunked_sentences:
         # Print results per sentence
         # print _extract_entity_names(tree)
         
         entity_names.extend(self._extract_entity_names(tree))
     return entity_names
Developer: yewwah, Project: recruit, Lines of code: 12, Source: ner.py



Note: The nltk.ne_chunk_sents examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors, and any use or redistribution must follow the corresponding project's license. Please do not republish without permission.

