
Python tokenize.sent_tokenize Function Code Examples


This article collects typical code examples of the nltk.tokenize.sent_tokenize function in Python. If you are unsure exactly what sent_tokenize does, how to call it, or what real-world usage looks like, the curated examples below should help.



Below are 20 code examples of the sent_tokenize function, sorted by popularity by default.
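
Before the examples, here is a minimal, self-contained sketch of a typical sent_tokenize call (the sample text and the output shown in the comment are illustrative; it assumes the pre-trained Punkt sentence model has been downloaded):

import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize relies on the pre-trained Punkt sentence model; fetch it once if it is missing.
nltk.download('punkt')

text = "NLTK makes sentence splitting easy. It ships with the Punkt tokenizer. Try it!"
print(sent_tokenize(text))
# ['NLTK makes sentence splitting easy.', 'It ships with the Punkt tokenizer.', 'Try it!']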

Example 1: post

 def post(self):
     args = parser.parse_args()
     text = {'text': args['text']}
     print text
     print sent_tokenize(text['text'])
     print word_tokenize(text['text'])
     return text['text']
Developer: lhofer, Project: Flask_text_processing_API, Lines: 7, Source: api_old.py


Example 2: split_sentence_based_on_rules

def split_sentence_based_on_rules(sent):
    if re.search(r' \.+ ', sent):
        sentences = re.split(r' \.+ ', sent)
    elif re.search(r'@ ---- @', sent):
        sentences = re.split(r'@ ---- @', sent)
    elif re.search(r'\.\w+\:', sent):
        sent = re.sub(r'\.(\w+)\:', r'. \1:', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r'\, as well as', sent):
        sent = sent.replace(', as well as', '. As well as')
        sentences = sent_tokenize(sent)
    elif re.search(r'[a-z\.]+[A-Z][a-z]+:', sent):
        k = re.findall(r' [a-z\.]+([A-Z][a-z]+:)', sent)
        p = chr(ord(max(sent)) + 1)
        sentences = sent.replace(k[0], p + k[0]).split(p)
    elif re.search(r'\; ', sent):
        sent = re.sub(r'\; ', r'. ', sent)
        sentences = sent_tokenize(sent)
    elif re.search(r', and, ', sent):
        sent = sent.replace(', and, ', '. And, ')
        sentences = sent_tokenize(sent)
    elif re.search(r'president\: Wechsler', sent):
        sent = sent.replace(': ', '. ')
        sentences = sent_tokenize(sent)
    elif re.search(r'\, ', sent):
        sentences = re.split(r'\, ', sent)
    else:
        sentences = [sent[:349],sent[350:]]
        print("Using greedy sentence tokenization")

    text_len = [len(sentence) for sentence in sentences]
    return sentences
Developer: bethard, Project: timenorm, Lines: 34, Source: preprocess_functions.py


Example 3: load_file_sentences

def load_file_sentences(filepath):
    index = filepath.rfind('/')
    if index < 0:
        sents = sent_tokenize(PlaintextCorpusReader('.', filepath).raw())
    else:
        sents = sent_tokenize(PlaintextCorpusReader(filepath[:index], filepath[index+1:]).raw())
    return sents
Developer: tkoane, Project: stat471project, Lines: 7, Source: stat_code.py


Example 4: realtime

def realtime():
    model_parsing()
    data_df=pd.read_csv('Test_Survey.csv')
    data_df.Verbatim=data_df.Verbatim.fillna(0)
    unique_id=data_df['Unique_Id']
    verbatims=data_df['Verbatim']
    data_dict = dict(zip(unique_id, verbatims))
    Results_df=pd.DataFrame(columns=('Unique_id','Sentence', 'category', 'Sentiment'))
    model_df = pd.read_csv('Model_modified_twitter_test.csv')
    for uid,line in data_dict.items(): 
        line=str(line).decode('utf-8',errors='ignore') # make sure the program doesn't hit a unicode error; add error handling to cover other input formats
        try:
            line_list=tokenize.sent_tokenize(str(line))
            for line in line_list:
                original_line=line
                for p in list(punctuation):
                    line=line.replace(p,'')
                line=line.lower()
                line_SC=tb.blob.BaseBlob(line)
                line=line_SC.correct()
                line=str(line)
                #print uid
                sentiment_score=sentiment_calc(line)
                
                temp_df=core_classify(line,uid,sentiment_score,model_df,original_line)
                #Results_df = Results_df.append(temp_df)
                
                yield temp_df
        except UnicodeEncodeError:
            temp_df = pd.DataFrame({'Unique_id':[uid],'Sentence':[original_line],'category':['Invalid text data'],'Sentiment':[sentiment_score]})
            yield temp_df
            #Results_df = Results_df.append(temp_df)
    Results_df.to_csv('test_analysis.csv',index=False, encoding = 'utf-8')
Developer: ght438, Project: Loki, Lines: 34, Source: Text_analytics_gui.py


Example 5: process_statuses

def process_statuses(uid):
	statuses_list = {}
	in_path = 'Data/'+uid+'/statuses_list.pickle'
	if os.path.exists(in_path):
		f = open(in_path,'rb')
		j = 0
		while True:
			try:
				statuses = pickle.load(f)
				for status in statuses:
					j += 1
					tweet = status.text
					sents = sent_tokenize(tweet)
					text = ""
					for sent in sents:
						#print("Sent: ", sent)
						sent_text = re.sub(r'RT\s@\w+:\s|@\w+\s|#|http://.*$|http://.*\s|https://.*$|https://.*\s|\n|\\U\w+', "", sent)
						sent_text = highpoints.sub("", sent_text)
						#print(sent_text)
						tokens = word_tokenize(sent_text)
						words = [w.lower() for w in tokens if w.isalpha() or w.isalnum()]
						stop_words = set(stopwords.words('english'))
						filtered_words = [w for w in words if not w in stop_words]
						statuses_list[sent] = filtered_words	#structure: key:integrate sentence, value: filtered_words 
			except EOFError:
				print(j)
				break
	#print("statuses_list: ", statuses_list)
	return statuses_list 
Developer: QiaozhiWang, Project: Sensitive_tweets, Lines: 31, Source: extract_sentweet-seperate.py


Example 6: sentences

def sentences(a, b):
    """Return sentences in both a and b"""
    asplit = sent_tokenize(a)
    bsplit = sent_tokenize(b)
    # use set again
    same = {x for x in asplit if x in bsplit}
    return list(same)
Developer: dillon, Project: cs50, Lines: 7, Source: helpers.py


Example 7: embed

def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []

    max_len = 0
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)


    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        embedded_words = []
        
        for word in tokenized_sentence:
            try:
                word = model[word]
            except:
                word = np.zeros(300)
            embedded_words.append(word)

        #padding    
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))

        embedded_sentences.append(embedded_words)

    embedded_sentences = np.array(embedded_sentences)

    return embedded_sentences
Developer: RemedyHealthcare, Project: cnn-text-classification-tf, Lines: 33, Source: data_helpers.py


Example 8: split_reddit_reviews

    def split_reddit_reviews(self,reviews):
        columns = ['Text','Score', 'True']
        #Calculate total number of sentences to fill up the data frame
        count=0
        for index,each_review in reviews.iterrows():

            split_sentences=sent_tokenize(each_review['Text'])
            count+=len(split_sentences)
        print "total number of sentences {}".format(count)

        df = pd.DataFrame(index=range(0,count), columns=columns)
        Text,Score,True=[],[],[]
        for index,each_review in reviews.iterrows():
            split_sentences=sent_tokenize(each_review['Text'])
            actual_tag=each_review['True']
            score_tag=each_review['Score']
            for each_split_sentence in split_sentences:
                Text.append(each_split_sentence)
                Score.append(actual_tag)
                True.append(score_tag)
        print "Count ={} Text.length {}".format(count,len(Text))
        df['Text']=Text
        df['Score']=Score
        df['True']=True
        df.to_csv('../data/reddit_reviews.csv')
Developer: pratheeksh, Project: Sarcasm-detection-project, Lines: 25, Source: sarcasmClassify.py


Example 9: inputfactx

def inputfactx(rev, include_vpr):
    this_business = find_business(rev.bizid)
    this_user = find_user(rev.uid)
    result = [ this_business.stars ]
    if include_vpr:
        result += [ this_user.get_vpr() ]
    result += [
        this_user.reviewCount,
        len(rev.text),
        rev.stars,
        rev.get_days() ]
    if len(rev.text) == 0:
        result += [ 0, 0, 0, 0, 0 ]
    else:
        excount = 0
        for sent in sent_tokenize(rev.text):
            ss = sent.strip()
            if ss.endswith('!'):
                excount += 1
        result += [ excount,
        np.mean([len(sent) for sent in sent_tokenize(rev.text)]),
        len(sent_tokenize(rev.text)),
        len(re.findall('\n\n', rev.text)) + 1,
        len(rev.text.splitlines()[0]) ]
    result += [ this_business.longitude, this_business.latitude ]
    return result
Developer: jingjingh, Project: kaggle_yelp, Lines: 26, Source: LR.py


Example 10: tokenize_sentences

def tokenize_sentences(filename):
	file_dir = docs_dir + str(filename)
	f = open(file_dir, 'r')

	root = ET.parse(f).getroot()
	tags = root.getiterator('str')

	# read the relevant tags
	title_string = ''
	desc_string = ''
	for tag in tags:
		if tag.get('name')  == 'Title' :
			title_string = filter(lambda x: x in string.printable, tag.text.lower().strip())

		elif tag.get('name') == 'Abstract':
			desc_string = filter(lambda x: x in string.printable, tag.text.lower().strip().replace('relevant documents will describe', ''))

	f.close()

	sentences = sent_tokenize(title_string)
	title_words = []
	for s in sentences:
		title_words = title_words + word_tokenize(s)

	sentences = sent_tokenize(desc_string)
	desc_words = []
	for s in sentences:
		desc_words = desc_words + word_tokenize(s)

	
	return (title_words, desc_words)
Developer: Tiotao, Project: CS3245HW4, Lines: 31, Source: index.py


Example 11: tokenize

def tokenize(text, grams=1):
  wordStems = lambda s: map(stem, word_tokenize(s))
  sentTokens = lambda tok, s: tok + wordStems(s)

  if grams == 1:
    return list(reduce(sentTokens, sent_tokenize(text), [ ]))
  else:
    return list(ngrams(reduce(sentTokens, sent_tokenize(text), [ ]), grams))
Developer: nithinkrishna, Project: tamil-text-summarization, Lines: 8, Source: main.py


Example 12: main

def main(param = 0):
    ''' 
    0 for no stem
    1 for porter
    2 for lancaster
    '''
    both_pos_index = {}
    tit_pos_index = {}
    abs_pos_index = {}


    if param == 0:
        path = './NoStemmer/'
    elif param == 1:
        path = './Porter/'
    elif param == 2:
        path = './Lancaster/'

    for i in range(1,1001):
            
        '''open xml file and get abstract and title'''
        try: 
            filename = "./data/%d.xml" %i
            data = open(filename)
        except:
            print "can't open file %s" %filename
            return 0

        docid = filename.split('/')[-1].split('.')[-2]
        
        tree = etree.fromstring(data.read())
    
        title = tree.find('Title').text
        abstract =  tree.find('Abstract').text
    
    
        #####################################################
        # Step2 tokenize and make position index dictionary #
        #####################################################
        '''sentence tokenize'''
        if title != None:
            title = title.replace('[','',1).replace(']','',1)
            titles = [s.replace('&amp;', '') for s in sent_tokenize(title)]
            tit_pos_index = position_index(tit_pos_index, titles, docid, param) 
           
        if abstract != None:
            abstracts = [s.replace('&amp;', '&') for s in sent_tokenize(abstract)] 
            both = titles + abstracts
        else:
            both = titles
            
        both_pos_index = position_index(both_pos_index,both,docid, param)
    '''save position index to json'''
    
    with codecs.open( './' + path.split('/')[1] + '_both_index' + '.json', mode = 'w') as a:
        json.dump(both_pos_index, a)
Developer: tancc, Project: search-practice, Lines: 56, Source: generate_index.py


Example 13: tag_words_by_sentence

def tag_words_by_sentence(input_filename, output_path=''):
#    text = get_file_text(input_filename)
    text = 'Every day I see blue. But the sky is red. Eagles are green'
    sentences = sent_tokenize(text)
#    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(s) for s in sent_tokenize(text)]
#    word_tokens = nltk.tag.batch_pos_tag(sent_tokenize(text))
    word_pos = nltk.tag.batch_pos_tag(word_tokens)

        
    return
Developer: bchoatejr, Project: religion, Lines: 11, Source: nlp_word_tools.py


Example 14: sentences

def sentences(a, b):
    """Return sentences in both a and b"""
    a1 = set(sent_tokenize(a))
    b1 = set(sent_tokenize(b))
    ans = []

    for line in a1:
        if line in b1:
            ans.append(line)

    return ans
Developer: AadeshSalecha, Project: Intro-to-Computer-Science-CS50-, Lines: 11, Source: helpers.py


Example 15: lexical_features

    def lexical_features(self):
        """ Lexical features
        """
        features = []
        # Add the first token from the top-1st span on stack
        if self.stackspan1 is not None:
            text = self.stackspan1.text
            texts1 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('StackSpan1','BEGIN-WORD-STACK1',wordb[0].lower()))
            features.append(('StackSpan1','BEGIN-END-STACK1',worde[-1].lower()))
            features.append(('StackSpan1','BEGIN-END-WORD-STACK1',wordb[0].lower(),worde[-1].lower()))


        if self.stackspan2 is not None:
            text = self.stackspan2.text
            texts2 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('StackSpan2','BEGIN-WORD-STACK2',wordb[0].lower()))
            features.append(('StackSpan2','BEGIN-END-STACK2',worde[-1].lower()))

        if self.queuespan1 is not None:
            text = self.queuespan1.text
            textq1 = word_tokenize(text)
          #  print texts1
            sent_tokenize_list =sent_tokenize(text)
            wordb = word_tokenize(sent_tokenize_list[0] )
            worde = word_tokenize(sent_tokenize_list[-1] )
       #     print wordb[0]
            features.append(('QueueSpan1','BEGIN-WORD-QUEUE1',wordb[0].lower()))
            features.append(('QueueSpan1','BEGIN-END-QUEUE',worde[-1].lower()))
            features.append(('QueueSpan1','BEGIN-END-WORD-QUEUE1',wordb[0].lower(),worde[-1].lower()))


        if self.stackspan2 is not None and self.stackspan1 is not None:
             features.append(('StackSpan1','LENGTH-STACK1-STACK2',len(texts1),len(texts2)))
        if self.queuespan1 is not None and self.stackspan1 is not None :

            features.append(('StackSpan1','LENGTH-STACK1-QUEUE1',len(texts1),len(textq1)))
       #     features.append(('StackSpan1','POS-START-STACK1-QUEUE1',begins1,beginq1))

        for feat in features:
            yield feat
Developer: parry2403, Project: CodeRepo, Lines: 51, Source: feature.py


Example 16: training_ner

	def training_ner(self, paragraph, classification):
		sentence = sent_tokenize(paragraph)
		#print paragraph
		
		#result = []
		train = []
		sentence_ne = ""
		# 1. Split the paragraph into sentences
		for index, data in enumerate(sentence):	
			tokenize = word_tokenize(data)
			div_sentence = []
			for word in tokenize:
				#check_kota = len(list(self.db.cities.find({"kota":re.compile("^"+word+"$", re.IGNORECASE)})))>=1
				check_kota = (self.db.location.find({"$text": {"$search": word.lower()}}).count())>=1
				# print "word : %s, check : %s"%(word,check_kota) 
				if not check_kota:
					# if the word is not a city, reduce it to its stem (base form)
					sent_stem = self.stemmer.stem(word)
					word = sent_stem
				div_sentence.append(word)
			train.append(" ".join(div_sentence))
			# parameters: self.div_sentence_ner(stemmed_sentence, original_sentence, classification_type)
			sentence_ne = self.div_sentence_ner("".join(train), " ".join(tokenize), classification)
			#result.append(sentence_ne)
			# reset the train list so it is not carried into the next NER training pass
			train = []

		return sentence_ne
Developer: pwcahyo, Project: ner_maxent, Lines: 28, Source: maxent.py


Example 17: make_sentences

    def make_sentences(self):

        """
        Makes sentences from raw documents.
        Each sentence is wrapped up in a sentence class
        :return: None
        """

        # Create parameters for NER and dependency parsing,
        # and pass them to the sentence object

        # set config file
        config = CP.RawConfigParser()
        config = config
        config.read('config.py')
         # Server for dependency parsing
        server = ServerProxy(JsonRpc20(),TransportTcpIp(addr=("127.0.0.1", 8080), timeout=200.0))

        # Parameters for named entity recognition

        # get the classifier and tagger location from config file
        tagger = config.get('NER','tagger') # gets the path of the stanford tagger
        classifier = config.get('NER','classifier') # gets the path of the stanford classifier
        st = StanfordNERTagger(classifier,tagger)


        if self.document == None:
            return

        sent = sent_tokenize(self.document) # contains raw sentences
        for i in range(len(sent)):
            s = Sentence(sent[i],i, server, st) # We also pass the server object and nertagger
            self.sentences.append(s)
Developer: vignesh117, Project: MusicalText, Lines: 33, Source: TrainDocument.py


Example 18: __init__

 def __init__(self, content, remove_punct=True):
     self._tokcont = [word_tokenize(s) for s in sent_tokenize(content)]
     if remove_punct:
         self._tokcont = [[w for w in s if w not in punctuation]
                          for s in self._tokcont[:]]
     # Remove zero-length sentence
     self._tokcont = [s for s in self._tokcont[:] if len(s) > 0]
Developer: kemskems, Project: otdet, Lines: 7, Source: feature_extraction.py


Example 19: markovize

def markovize(word1, word2, word3, fileid, char_limit=None):   
    
    with open(fileid, encoding='utf-8') as f:
        text = f.read()
    
    sentences = sent_tokenize(text)
    sent_tokens = defaultdict(list)
    for sentence in sentences:
        tokens = re.findall(r"[\w']+|[.,?!:;]", sentence)
        nwise_ = nwise(tokens, n=4)
        if nwise_:
            for token1, token2, token3, token4 in nwise_:
                sent_tokens[token1, token2, token3].append(token4)
    
    too_long = True
    
    while too_long:
        sentence = [word1, word2, word3]
    
        utterance = build_sentence(sentence, sent_tokens)
        len_utterance = len(utterance)
         
        if char_limit != None and len_utterance > char_limit:
            too_long = True
        else:
            too_long = False
            
    return utterance
Developer: timothybeal, Project: kjvbot_tweeter, Lines: 28, Source: markovbot.py


Example 20: line_to_sentences

def line_to_sentences(line):
    raw_sentences = sent_tokenize(line.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(w2v_normalize(raw_sentence))
    return sentences
Developer: IlyaGusev, Project: nlp-practice, Lines: 7, Source: w2v_model.py



Note: The nltk.tokenize.sent_tokenize examples in this article were collected by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors. Please consult each project's license before distributing or using the code, and do not reproduce this article without permission.

