Python nltk.regexp_tokenize Function Code Examples


This article collects typical usage examples of the Python nltk.regexp_tokenize function. If you are wondering exactly how regexp_tokenize is used, how to call it, or what real-world code that uses it looks like, the hand-picked examples below may help.



The following shows 20 code examples of the regexp_tokenize function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
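
Before the examples, here is a minimal sketch (not taken from any of the projects below) of what a call to nltk.regexp_tokenize looks like; the text and pattern are purely illustrative:

import nltk

text = "regexp_tokenize splits text with a regular expression, e.g. $12.40 or U.S.A."
# Keep runs of word characters as tokens; everything else is discarded.
tokens = nltk.regexp_tokenize(text, r"\w+")
print(tokens)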

Example 1: get_freqs

def get_freqs(text):

    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)

    pattern = r'''(?x)    # set flag to allow verbose regexps
                    ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
                    | \w+(-\w+)*        # words with optional internal hyphens
                    | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
                    | \.\.\.            # ellipsis
                    | [][.,;"'?():-_`]  # these are separate tokens
                     '''

    if type(text) == list:
        print 'number of links: '+ str(len(text))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print "frequency size: "+str(len(frequencies))
    return frequencies
Developer: seemless, Project: chainlink, Lines: 29, Source: chainlink_util.py
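
A simplified, Python 3 sketch of the same frequency-counting idea (this is not the chainlink project's code; the sample sentence is made up, and the NLTK stopwords corpus is assumed to be downloaded):

from collections import defaultdict
import nltk

def simple_freqs(text, pattern=r"\w+(?:-\w+)*"):
    # Assumes nltk.download('stopwords') has been run.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    freqs = defaultdict(int)
    for word in nltk.regexp_tokenize(text, pattern):
        if len(word) > 2 and word.lower() not in stop_words:
            freqs[word] += 1
    return freqs

print(simple_freqs("Chained links link to other chained links."))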


Example 2: bag_of_words

def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Developer: Juicechuan, Project: workspace, Lines: 33, Source: naive_bayes.py
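
One caveat for Python 3: FreqDist.keys() is a view that cannot be sliced, so fdict.keys()[theta:] only works on old NLTK 2 / Python 2 setups where keys() returned a frequency-sorted list. A hedged micro-example of an equivalent on current versions:

from nltk import FreqDist

fdist = FreqDist("abracadabra")          # counts characters, just for illustration
theta = 2
# Keys ordered by decreasing frequency, skipping the theta most frequent ones.
rare = [w for w, _ in fdist.most_common()[theta:]]
print(rare)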


Example 3: load

def load(f=str):
    import re
    files = open(f)
    raw = files.read()
    pattern = re.compile(r"""\$?\d+(\.\d+)?%?    # currency
                             \d+/\d+/\d+         #dates""", re.VERBOSE)
    nltk.regexp_tokenize(raw, pattern)
Developer: MariaSpyropoulou, Project: NLTK-Book, Lines: 7, Source: Chapter3.py
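
Two details to watch in this snippet: the verbose pattern joins its two alternatives without a separating "|", and the token list is never returned. A hedged rework under those assumptions (the date alternative is tried first, since alternatives match left to right and the currency branch would otherwise swallow the leading digits of a date):

import nltk

def load(path):
    with open(path) as f:
        raw = f.read()
    # Inline (?x) is used so the verbose flag survives regexp_tokenize's own compilation.
    pattern = r"""(?x)
          \d+/\d+/\d+                # dates such as 12/31/1999 (tried first)
        | \$?\d+(?:\.\d+)?%?         # currency and percentages
    """
    return nltk.regexp_tokenize(raw, pattern)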


Example 4: nltkTest

def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)

    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
Developer: cynricshu, Project: ChinaVis2016, Lines: 8, Source: handleSubject.py


Example 5: regularExpressionTokenizer

def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)         # set flag to allow verbose regexps 
            ([A-Z]\.)+        # abbreviations, e.g. U.S.A. 
          | \w+(-\w+)*        # words with optional internal hyphens 
          | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82% 
          | \.\.\.            # ellipsis 
          | [][.,;"'?():-_`]  # these are separate tokens 
    '''
    print nltk.regexp_tokenize(text, pattern)
Developer: hbdhj, Project: python, Lines: 10, Source: chapter3.py
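
This is the classic pattern from the NLTK book. On current NLTK/Python versions the capturing groups make regexp_tokenize return group contents rather than whole matches (findall semantics), which is why later editions write them as non-capturing. A hedged Python 3 restatement:

import nltk

text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)                # set flag to allow verbose regexps
      (?:[A-Z]\.)+                # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*                # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                      # ellipsis
    | [][.,;"'?():_`-]            # these are separate tokens; '-' placed last to avoid a range
'''
print(nltk.regexp_tokenize(text, pattern))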


Example 6: get_links

def get_links(text):
    # checks only for  'http://...' and 'www...'
    text = text + " "
    pat = "http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = "\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    links = map(lambda x: x[:-1], links)
    return links
Developer: ItsLastDay, Project: Twitter-language-identification, Lines: 10, Source: string_processing.py
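
A rough, self-contained illustration of the same link extraction (the sample text is made up; the patterns mirror those above):

import nltk

text = "See http://example.com/page and www.example.org for details"
links = nltk.regexp_tokenize(text + " ", r"http://.*?\s")
links += nltk.regexp_tokenize(" " + text + " ", r"\swww\..*?\..*?\s")
# Trim the surrounding whitespace that the patterns deliberately include.
print([link.strip() for link in links])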


Example 7: poss_test

def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    stopwords = sw
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a == 0:
            a += 1
            continue
        if a%1000 == 0:
            print a    
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)

        #light stem
        #title = set([stem(word) for word in title])
        #body = set(body)
        #body = set([stem(word) for word in body])

        #remove stopwords
        #body = filter(g,body)
        #title = filter(g,title)

        body = ' '.join(body)
        title = ' '.join(title)
        t.write('%s , %s \n'%(title,body))
Developer: rve, Project: keyword, Lines: 53, Source: stem.py
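
Examples 7, 8 and 10 all call nltk.clean_html, which was removed in NLTK 3.x, so they only run as-is on older NLTK releases. A rough replacement sketch for the HTML-stripping plus tokenizing step, assuming the bs4 (BeautifulSoup) package is available; the capturing group of the original pattern is also dropped, since on current NLTK it would make regexp_tokenize return only the captured first letter:

from bs4 import BeautifulSoup
import nltk

def strip_and_tokenize(html, pattern=r"[a-z]\w+"):
    # Strip tags with BeautifulSoup instead of the removed nltk.clean_html.
    text = BeautifulSoup(html, "html.parser").get_text(" ")
    return nltk.regexp_tokenize(text.lower(), pattern)

print(strip_and_tokenize("<p>Some <b>HTML</b> body text from a question.</p>"))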


Example 8: poss_test

def poss_test(test_file,test_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)

    t = open(test_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw 
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:

        if a%10000 == 0:
            print(a)
        a += 1
        #if a == 8:
        #    sys.exit(1)

        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])


        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s"\n'%(row[0],title,body))
Developer: rve, Project: keyword, Lines: 51, Source: nltk_without_stem.py


Example 9: query_episode

    def query_episode(self, show_title, 
        ep_title, se_number, ep_number, runtime):
        """build video list prior to scoring
        """
        qres = {}

        # Query 1
        qlist = (show_title, ep_title)
        # Search YouTube
        tmp = self.search('%s %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 2
        qlist = (show_title, ep_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s %s  %s  %s' % qlist)
        for k, v in tmp.items():
            qres[k] = v
        # Query 3
        qlist = (show_title, 
            se_number, ep_number)
        # Search YouTube
        tmp = self.search('%s s%02de%02d' % qlist)
        for k, v in tmp.items():
            qres[k] = v

        # Show tokens
        sh_stem = [self._lancaster.stem(t) \
            for t in nltk.regexp_tokenize(
                show_title.encode('utf8'), r"\w+")]

        # Episode stem tokens if exist
        if ep_title:
            ep_stem = [self._lancaster.stem(t) \
                for t in nltk.regexp_tokenize(
                    ep_title.encode('utf8'), r"\w+")]
        else:
            ep_stem = None

        res = {'Output': qres, 
               'Input': {},}
        res['Input']['show_title'] = show_title
        res['Input']['ep_title'] = ep_title
        res['Input']['sh_stem'] = sh_stem
        res['Input']['ep_stem'] = ep_stem
        res['Input']['se_number'] = se_number
        res['Input']['ep_number'] = ep_number
        res['Input']['runtime'] = runtime

        return res
Developer: BrianDurham, Project: couchtube, Lines: 51, Source: ytquery.py
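
A small sketch of just the stemming step used above, with a made-up show title (LancasterStemmer ships with NLTK; the Python 2 utf-8 encoding step is omitted):

import nltk
from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()
show_title = "The Walking Dead"          # illustrative value
sh_stem = [lancaster.stem(t) for t in nltk.regexp_tokenize(show_title, r"\w+")]
print(sh_stem)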


Example 10: poss_train

def poss_train(train_file,train_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)

    t = open(train_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw  # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a%100000 == 0:
            print a    
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        #st = LancasterStemmer()
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])

        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
Developer: rve, Project: keyword, Lines: 50, Source: pre_nltk.py


Example 11: normalized

def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation
    """
    if fix:
        text = fix_text(unicode(text))
    pattern = r"""(?x)    # verbose regexps
        \w+(-\w+)*        # words with optional internal hyphens
    """
    result = [w for w in nltk.regexp_tokenize(text, pattern)]
    if lowercase:
        result = [w.lower() for w in nltk.regexp_tokenize(text, pattern)]
    if tuples:
        result = tuple(result)
    return result
Developer: elyase, Project: eikon_challenge, Lines: 14, Source: utils.py
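
A compact Python 3 variant of the same idea (the ftfy fix_text step is omitted, and the text is tokenized only once):

import nltk

def normalized(text, lowercase=True, tuples=False):
    tokens = nltk.regexp_tokenize(text, r"\w+(?:-\w+)*")   # words with optional internal hyphens
    if lowercase:
        tokens = [t.lower() for t in tokens]
    return tuple(tokens) if tuples else tokens

print(normalized("Well-known U.S. firms, e.g. ACME Corp."))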


Example 12: compute_df

 def compute_df(self, document_list):
     '''Compute document frequency based on input document list'''  
     df_cache = dict()
     df_output = dict()
     
     d_index = 0
     for document in document_list:
         d_index += 1
         # tokenize each document
         reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
         for item in reg_toks:
             # change each word to lower case and lemmatize
             item = normalise(item)
             if item not in df_cache:
                 df_cache[item] = set([d_index])
             else:
                 df_cache[item].add(d_index)
     
     for item in df_cache:
         if acceptable_word(item):
             df_output[item] = len(df_cache[item])
     
     df_output['total_document'] = len(document_list)
     
     return df_output
Developer: luotigerlsx, Project: DataAnalysis_ML, Lines: 25, Source: keyword_extract.py


Example 13: main

    def main(self, text):
        """Breaks a single string into a tree using the grammar and returns
        the specified words as a string."""

        if text is None:
            return None

        try:
            text = text.encode("ascii", "ignore")
        except:
            text = text.decode("utf-8", "ignore").encode("ascii", "ignore")

        chunker = nltk.RegexpParser(grammar)

        toks = nltk.regexp_tokenize(text, sentence_re)
        postoks = nltk.tag.pos_tag(toks)

        #print postoks
        tree = chunker.parse(postoks)

        terms = self.get_terms(tree)

        words = self.get_words(terms)

        return words
Developer: hongyu89, Project: IndeedScraper, Lines: 25, Source: GrammarParser.py
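
The method above depends on a grammar and a sentence_re pattern defined elsewhere in the project. A hedged, self-contained sketch of a comparable noun-phrase chunking pipeline; the grammar and pattern below are illustrative guesses, not the project's actual values:

import nltk

sentence_re = r"\w+(?:-\w+)*"             # assumed token pattern
grammar = r"NP: {<JJ>*<NN.*>+}"           # assumed noun-phrase grammar
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize("Regular expressions make simple tokenizers", sentence_re)
postoks = nltk.tag.pos_tag(toks)          # needs the averaged_perceptron_tagger data
tree = chunker.parse(postoks)
print(tree)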


Example 14: generate_vocab

def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopwords removal.

    Args:
        papers (list of tuples): the raw list of papers from which generates the vocabulary (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()

    # Generate author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    #         we keep tokens that are words (with optional internal hyphens), acronyms and percentages
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile("^\d+$")
    tokens = set([t for t in tokens if not num_re.match(t)]) # we remove only-numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()

    return [porter.stem(t) for t in tokens]
Developer: tizot, Project: recom-system, Lines: 25, Source: dataset_tools.py
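
A reduced sketch of the same vocabulary pipeline with the project-specific StringCleaner step left out; the papers list is made up, and the NLTK stopwords corpus is assumed to be available:

import re
import nltk
from nltk.stem import PorterStemmer

papers = [("1", "Topic models", "Latent topics for 10000 documents."),
          ("2", "Deep parsing", "We parse noisy text with 95% accuracy.")]
corpus = " ".join(p[1] + " " + p[2] for p in papers)
pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
tokens = {t for t in tokens if not re.match(r"^\d+$", t)}    # drop purely numeric tokens
porter = PorterStemmer()
print(sorted(porter.stem(t) for t in tokens))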


Example 15: extract

 def extract(self, text):
     ''' Extract and freudify noun phrases from text, return all succesfully
     freudified noun phrases. '''
     
     toks = nltk.regexp_tokenize(text, self.sentence_re)
     postoks = nltk.tag.pos_tag(toks)
     tree = self.chunker.parse(postoks)
     terms = self._get_terms(tree)
     
     phrases = sets.Set()
     
     # Loop through all the noun phrases and try to freudify them.
     for term in terms:
         if (len(term)) < 2: continue
         changed = False
         context = ""
         phrase = []
         for part in term:
             word, tag = part
             word = word.encode('ascii', 'replace')
             phrase.append(word.lower())
             rpl = self.replace_word(tag[:2], word)
             if len(rpl[2]) > 0:
                 context = rpl[2]
                 phrase[-1] = rpl[0]
                 changed = True
         if changed:
             phrase = " ".join(phrase).strip()
             phrase.encode('ascii', 'replace')
             phrase = str(phrase)
             if phrase not in self.own_phrases[context]:
                 phrases.add((str(phrase), context))    
       
     phrases = list(phrases)      
     return phrases
Developer: assamite, Project: agentwordgame, Lines: 35, Source: freud.py


Example 16: ShowCollocations

def ShowCollocations():
	text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
	import nltk
	from nltk.collocations import BigramCollocationFinder
	from nltk.collocations import TrigramCollocationFinder
	from nltk.metrics import BigramAssocMeasures
	from nltk.metrics import TrigramAssocMeasures
	pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
	data = resultsbox.get(1.0,END)
	rawtext=nltk.regexp_tokenize(data, pattern)
	prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
	text.delete(1.0, END)
	text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
	text.insert(END, "\nBigram Collocations:\n")
	bigram = BigramAssocMeasures()
	bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
	bigramfinder.apply_freq_filter (3)
	bigrams=bigramfinder.nbest(bigram.pmi, 10)
	for item in bigrams:
		first = item[0]
		second = item[1]
		text.insert(END, first)
		text.insert(END, " ")
		text.insert(END, second)
		text.insert(END, "\n")
Developer: muranava, Project: Text-Tools, Lines: 25, Source: collocationreadability.py
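
The same collocation search can be run outside the Tkinter GUI. A standalone sketch, where corpus.txt is a placeholder file name and the NLTK stopwords list is assumed to be installed:

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

pattern = r"(?:[A-Z]\.)+|\w+(?:[-']\w+)*|\$?\d+(?:\.\d+)?%?|\.\.\."
stop = set(nltk.corpus.stopwords.words('english'))

with open('corpus.txt') as f:            # placeholder input file
    raw_tokens = nltk.regexp_tokenize(f.read(), pattern)

words = [w.lower() for w in raw_tokens if w.isalpha() and w.lower() not in stop]
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)              # keep bigrams seen at least 3 times
print(finder.nbest(BigramAssocMeasures.pmi, 10))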


Example 17: word_couple_con_puntuacion_pares_minusculas

def word_couple_con_puntuacion_pares_minusculas(lista):
    word_couples = []
    
    
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+-*[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[.]+|[/,$?:;!()&%#=+{}*~.]+|[0-9]+"
    
    for oracion in lista:
        
        #oracion = str(oracion)
        #oracion = oracion.to_lower
        #print oracion
        
        
        tokens = nltk.regexp_tokenize(oracion.lower(), regexp)
        #print len(tokens)
        
#         tokens_lower = []
#         for i in range(len(tokens)):
#             palabra = str(tokens[i])
#             tokens_lower.append(palabra.to_lower() )          
            
        
        pairs = list(itertools.permutations(tokens, 2))
        for pair in pairs:
            word_couples.append(pair[0]+"~"+pair[1])
        
    return word_couples
Developer: jesusmiguelgarcia, Project: FSTmikes, Lines: 27, Source: attr_util_mk.py


Example 18: handle_doc

def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
Developer: CharLLCH, Project: work-for-py, Lines: 34, Source: adjust_word.py


Example 19: longitud_promedio_palabras_moens

def longitud_promedio_palabras_moens(lista):
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+"
    total_palabras_en_oraciones = 0
    num_oraciones = 0
    tokens = 0
    promedio_longitud_palabras_oraciones = []
    for oracion in lista:
        total_palabras_oracion = 0
        num_palabras_oracion = 0
        tokens = nltk.regexp_tokenize(oracion, regexp)
        total_palabras_en_oraciones += len(tokens)
        for palabra in tokens:
            total_palabras_oracion += len(palabra)
            num_palabras_oracion += 1
            #print palabra
            #print len(palabra)
        if total_palabras_oracion > 0:
            promedio_longitud_palabras_oraciones.append(total_palabras_oracion/num_palabras_oracion)
        else:
            print oracion
        #print len(tokens)
        #total += len(oracion.split())
        num_oraciones += 1
    #promedio = total_palabras_en_oraciones / num_oraciones
    #print promedio_longitud_palabras_oraciones
    suma_promedios=0
    num_promedios = 0
    for promedios in promedio_longitud_palabras_oraciones:
        suma_promedios += promedios
        num_promedios += 1
    promedio = suma_promedios/num_promedios
        
    #promedio = sum(promedio_longitud_palabras_oraciones)/float(len(promedio_longitud_palabras_oraciones))    
    return promedio
Developer: jesusmiguelgarcia, Project: FSTmikes, Lines: 34, Source: attr_util_mk.py
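
A compact restatement of the average-word-length computation (not the original author's code), using the same token pattern; the sample sentences are invented:

import nltk

def avg_word_length(sentences, regexp="[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+"):
    per_sentence = []
    for sentence in sentences:
        tokens = nltk.regexp_tokenize(sentence, regexp)
        if tokens:
            # Mean word length within this sentence.
            per_sentence.append(sum(len(t) for t in tokens) / len(tokens))
    # Mean of the per-sentence means, as in the example above.
    return sum(per_sentence) / len(per_sentence) if per_sentence else 0.0

print(avg_word_length(["El niño corre muy rápido.", "Hola mundo."]))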


Example 20: handleSubject1

def handleSubject1(outputFile):
    """
    :return: dict
    """
    index = 0
    termdict = dict()
    subjectList = list()

    f = open("data/topic/subject1_w_date.txt")
    for item in f:
        array = item.strip().split("DELIMER")
        count = array[0]
        subject = array[3]

        for (regex, repl) in helper.regexList.items():
            subject = regex.sub(repl, subject)
        for s in helper.specialSet:
            subject = subject.replace(s, "")

        termList = nltk.regexp_tokenize(subject, helper.nltkPattern)  # use nltk-package to participle the subject
        s = ""
        for term in termList:
            if term.lower() not in helper.excludeSet:
                s += term + " "  # reconstruct the subject
                if term not in termdict:
                    termdict[term.strip()] = index
                    index += 1

        if s != "":
            regex = re.compile("\s+")
            s = regex.sub(" ", s)
            subjectList.append("{}DELIMER{}DELIMER{}DELIMER{}".format(count, array[1], array[2], s.strip()))

    fileHelper.writeIterableToFile(outputFile, subjectList)
    return termdict
Developer: cynricshu, Project: ChinaVis2016, Lines: 35, Source: handleSubject.py



Note: the nltk.regexp_tokenize examples in this article were compiled from source-code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and any redistribution or use should follow each project's license. Do not reproduce without permission.

