• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Python unicodedata.name函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中unicodedata.name函数的典型用法代码示例。如果您正苦于以下问题:Python name函数的具体用法?Python name怎么用?Python name使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了name函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: jatokenize

def jatokenize(content):
    """Tokenize Japanese text and keep only the content-bearing words.

    ``content`` is parsed with the module-level ``tagger``.  Every output
    line up to the ``EOS`` marker is split on tabs; the word is taken from
    field 2 and the part-of-speech string from field 3.

    Each kept word is written to the module-level ``ofs`` followed by a
    space, and a newline is written once all lines are processed.

    Returns:
        list: the kept words, each encoded as UTF-8.
    """
    ret_list = []
    for line in tagger.parse(content).split('\n'):
        if line == "EOS":
            break
        fields = line.split('\t')
        word = fields[2]

        # Classify the word by the Unicode name of its first character.
        # An empty word, an unnamed character or a non-text value cannot be
        # classified, so the word is skipped.  Only these errors are
        # caught, so KeyboardInterrupt and real bugs still surface.
        try:
            jtype = unicodedata.name(word[0])
        except (IndexError, TypeError, ValueError):
            continue

        # Ignore single-character words that are not kanji; this also
        # drops symbols such as the long-vowel mark and '*'.
        if len(word) == 1 and jtype[0:4] != 'CJK ':
            continue
        # Ignore two-character words written entirely in hiragana.  The
        # default of '' keeps an unnamed second character from raising.
        if (len(word) == 2 and jtype[0:4] == 'HIRA'
                and unicodedata.name(word[1], '')[0:4] == 'HIRA'):
            continue
        # Ignore words that start with a Latin letter.
        if jtype[0:4] == 'LATI':
            continue
        if word.isdigit():
            continue

        # Keep nouns, verbs, adverbs and adjectives only.
        pos = fields[3]
        if pos[:2] in ('名詞', '動詞', '副詞') or pos[:3] == '形容詞':
            ofs.write("%s " % word)
            ret_list.append(word.encode('utf8'))
    ofs.write("\n")
    return ret_list
开发者ID:garytaro,项目名称:jawikicorpus,代码行数:31,代码来源:jawikicorpus.py


示例2: data

 def data(self, index, role ):
     """Return the value of the cell at ``index`` for the requested Qt ``role``.

     Display role, by column: 0 the character, 1 its code point as hex,
     2 its count, 3 its HTML entity (named if known, numeric otherwise),
     4 its expanded Unicode category, anything else its title-cased
     Unicode name.  Alignment comes from COL_ALIGNMENT.  Tooltip and
     status-tip text comes from COL_TOOLTIPS for columns below 5 and is
     the character name otherwise.  Any other role yields None.
     """
     char, count = self.chardata.get_tuple(index.row())

     if role == Qt.DisplayRole:
         column = index.column()
         if column == 0:
             return char
         if column == 1:
             return '0x{0:04x}'.format(ord(char))
         if column == 2:
             return count
         if column == 3:
             if char in C.NAMED_ENTITIES:
                 return '&' + C.NAMED_ENTITIES[char] + ';'
             return '&#{0:d};'.format(ord(char))
         if column == 4:
             return UC_CAT_EXPAND[unicodedata.category(char).lower()]
         # Any remaining column is treated as column 5, the Unicode name.
         return unicodedata.name(char, 'no name?').title()

     if role == Qt.TextAlignmentRole:
         return COL_ALIGNMENT[index.column()]

     if role == Qt.ToolTipRole or role == Qt.StatusTipRole:
         column = index.column()
         if column < 5:
             return COL_TOOLTIPS[column]
         # The name column shows the full name as its tooltip, since a
         # narrow column may not expose the entire name any other way.
         return unicodedata.name(char, 'no name?').title()

     # Other roles are not supported.
     return None
开发者ID:B-Rich,项目名称:PPQT2,代码行数:29,代码来源:charview.py


示例3: codepoint_simple

def codepoint_simple(arg):
    """Describe the shortest-named character whose Unicode name matches ``arg``.

    ``arg`` is upper-cased and each space becomes ``.*\\b``.  Code points
    below 0xFFFF are scanned first for a match that ends on a word
    boundary; if nothing matches, the scan is repeated without the
    trailing boundary.  Returns ``about(u, cp, name)`` for the smallest
    ``(len(name), u, cp, name)`` tuple, or None when nothing matches.
    """
    arg = arg.upper()
    stem = '\\b' + arg.replace(' ', '.*\\b')

    def scan(pattern):
        # Collect every named code point whose name the pattern finds.
        hits = []
        for code in xrange(0xFFFF):
            char = unichr(code)
            try:
                char_name = unicodedata.name(char)
            except ValueError:
                continue
            if pattern.search(char_name):
                hits.append((len(char_name), char, code, char_name))
        return hits

    results = scan(re.compile(stem + '\\b'))
    if not results:
        results = scan(re.compile(stem))
    if not results:
        return None

    length, u, cp, name = min(results)
    return about(u, cp, name)
开发者ID:incuna,项目名称:ircbot,代码行数:32,代码来源:codepoints.py


示例4: _do_write

 def _do_write(fname, variable, version, date, table):
     print("writing {} ..".format(fname))
     import unicodedata
     import datetime
     import string
     utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
     INDENT = 4
     with open(fname, 'w') as fp:
         fp.write("# Generated: {iso_utc}\n"
                  "# Source: {version}\n"
                  "# Date: {date}\n"
                  "{variable} = (".format(iso_utc=utc_now.isoformat(),
                                          version=version,
                                          date=date,
                                          variable=variable))
         for start, end in table:
             ucs_start, ucs_end = unichr(start), unichr(end)
             hex_start, hex_end = ('0x{0:04x}'.format(start),
                                   '0x{0:04x}'.format(end))
             try:
                 name_start = string.capwords(unicodedata.name(ucs_start))
             except ValueError:
                 name_start = u''
             try:
                 name_end = string.capwords(unicodedata.name(ucs_end))
             except ValueError:
                 name_end = u''
             fp.write('\n' + (' ' * INDENT))
             fp.write('({0}, {1},),'.format(hex_start, hex_end))
             fp.write('  # {0:24s}..{1}'.format(
                 name_start[:24].rstrip() or '(nil)',
                 name_end[:24].rstrip()))
         fp.write('\n)\n')
     print("complete.")
开发者ID:thomasballinger,项目名称:wcwidth,代码行数:34,代码来源:setup.py


示例5: test_cjk

 def test_cjk(self):
     """Check name() and lookup() at the edges of the CJK ideograph ranges.

     For each (first, last) range, the code points at and just inside the
     boundary must have the algorithmic name ``CJK UNIFIED IDEOGRAPH-XXXX``
     and must round-trip through lookup().  For the code points just
     outside the boundary, lookup() must raise KeyError, and if name()
     raises ValueError its message must be 'no such name'.
     """
     import unicodedata
     # Compare the database version numerically: as plain strings,
     # "10.0.0" would sort below "4.1".
     version = tuple(int(part)
                     for part in unicodedata.unidata_version.split('.'))
     cases = ((0x3400, 0x4DB5),
              (0x4E00, 0x9FA5))
     if version >= (4, 1):
         cases = ((0x3400, 0x4DB5),
                  (0x4E00, 0x9FBB),
                  (0x20000, 0x2A6D6))
     for first, last in cases:
         # Test at and inside the boundary
         for i in (first, first + 1, last - 1, last):
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             # NOTE(review): the character is built through the
             # unicode-escape codec instead of unichr(), presumably so
             # astral code points also work on narrow builds -- confirm.
             char = ('\\U%08X' % i).decode('unicode-escape')
             assert unicodedata.name(char) == charname
             assert unicodedata.lookup(charname) == char
         # Test outside the boundary
         for i in first - 1, last + 1:
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             char = ('\\U%08X' % i).decode('unicode-escape')
             try:
                 unicodedata.name(char)
             except ValueError as e:
                 assert str(e) == 'no such name'
             # `raises` is not defined in this block; presumably the test
             # framework's helper asserting that the call raises KeyError.
             raises(KeyError, unicodedata.lookup, charname)
开发者ID:ParitoshThapliyal59,项目名称:pypy,代码行数:25,代码来源:test_unicodedata.py


示例6: extractKeyword

def extractKeyword(text,word_class=["名詞","形容詞"]):
	"""Collect keywords from ``text``: its hashtags plus selected MeCab tokens.

	``splitTag(text)`` is called first; element 0 of its result replaces
	``text`` and element 1 becomes the initial keyword list.  The remaining
	text is parsed with a MeCab ChaSen-format tagger, and a token is
	appended when the first field of its feature string is in
	``word_class`` and its first character is hiragana, katakana,
	half-width katakana, a CJK character, a Latin letter or a digit.

	Any exception raised while handling a token is printed and the loop
	moves on to the next token.

	NOTE(review): ``word_class`` is a mutable default argument; it is only
	read here, never modified.
	"""
	tmp = splitTag(text) # extract the hashtags first
	text = tmp[0]
	keywords = tmp[1]
	tagger = MeCab.Tagger('-Ochasen')
	node = tagger.parseToNode(text.encode('utf-8'))
	while node:
		try:
			if node.feature.split(',')[0] in word_class:
			#print node.surface
				# First character of the token; the test below only lets
				# digits, hiragana, katakana, kanji and Latin letters
				# through to keywords.
				uniname = node.surface.decode('utf-8')[0]
				if (unicodedata.name(uniname)[0:8] == "HIRAGANA") or (unicodedata.name(uniname)[0:8] == "KATAKANA") or (unicodedata.name(uniname)[0:18] == "HALFWIDTH KATAKANA") or (unicodedata.name(uniname)[0:3] == "CJK") or (unicodedata.name(uniname)[0:5] == "LATIN") or (unicodedata.name(uniname)[0:5] == "DIGIT"):
					# NOTE(review): as written this first replace maps '*'
					# to itself; one side was presumably a full-width
					# asterisk originally -- confirm against the source.
					term = node.surface.replace('*','*')
					# Swap straight quotes for typographic ones.
					term = term.replace('"','”')
					term = term.replace("'","’")
					keywords.append(term.decode('utf-8'))
					#print node.surface.decode('utf-8')
		except Exception as e:
			# Report the failing token and carry on with the next one.
			print "-"*10
			print "エラー(MeCab)"
			print node.surface
			print str(type(e))
			print str(e.args)
			print e.message
			print str(e)
			print "-"*10
		node = node.next
	return keywords
开发者ID:hanaken,项目名称:GetYouTrend,代码行数:28,代码来源:GetYouTrend.py


示例7: test_cjk

    def test_cjk(self):
        """Check name() and lookup() at the edges of the CJK ideograph ranges.

        The ranges tested depend on the Unicode database version.  Code
        points at and just inside each boundary must have the algorithmic
        name ``CJK UNIFIED IDEOGRAPH-XXXX`` and round-trip through
        lookup().  For code points just outside, lookup() must raise
        KeyError, and if name() raises ValueError its message must be
        "no such name".
        """
        import unicodedata

        # Compare the database version numerically: as plain strings,
        # "10.0.0" would sort below both "5" and "4.1".
        version = tuple(int(part)
                        for part in unicodedata.unidata_version.split("."))
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FA5))
        if version >= (5,):  # don't know the exact limit
            cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FCB), (0x20000, 0x2A6D6), (0x2A700, 0x2B734))
        elif version >= (4, 1):
            cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FBB), (0x20000, 0x2A6D6))
        for first, last in cases:
            # Test at and inside the boundary
            for i in (first, first + 1, last - 1, last):
                charname = "CJK UNIFIED IDEOGRAPH-%X" % i
                # NOTE(review): built through the unicode-escape codec
                # instead of unichr(), presumably so astral code points
                # also work on narrow builds -- confirm.
                char = ("\\U%08X" % i).decode("unicode-escape")
                assert unicodedata.name(char) == charname
                assert unicodedata.lookup(charname) == char
            # Test outside the boundary
            for i in first - 1, last + 1:
                charname = "CJK UNIFIED IDEOGRAPH-%X" % i
                char = ("\\U%08X" % i).decode("unicode-escape")
                try:
                    unicodedata.name(char)
                except ValueError as e:
                    assert str(e) == "no such name"
                # `raises` is not defined in this block; presumably the
                # test framework's helper asserting a KeyError is raised.
                raises(KeyError, unicodedata.lookup, charname)
开发者ID:cimarieta,项目名称:usp,代码行数:25,代码来源:test_unicodedata.py


示例8: clean_Ustring_fromU

def clean_Ustring_fromU(string):
    """Reduce a UTF-8 byte string to lower-case Latin words.

    The input is decoded as UTF-8 with undecodable bytes dropped.  Every
    character whose Unicode name starts with ``LATIN``, and the plain
    space, is kept; every other character, including unnamed ones, is
    replaced by a space.  The result is lower-cased, NFKC-normalized and
    split on whitespace.

    Args:
        string: UTF-8 encoded bytes.

    Returns:
        list: the words of the cleaned text; empty for empty input.
    """
    from unicodedata import name, normalize

    def keep_or_blank(ch):
        # One lookup per character; '' stands in for "no name".
        ch_name = name(ch, '')
        if ch_name.startswith('LATIN') or ch_name == 'SPACE':
            return ch
        return ' '

    # Joined in one pass instead of repeated concatenation, and always a
    # unicode string, even when the input is empty.
    gClean = u''.join(keep_or_blank(ch)
                      for ch in string.decode('utf-8', 'ignore'))

    # Retry on ever shorter prefixes if normalization rejects the text,
    # as the original nested fallbacks did; give up with a marker string.
    for limit in (None, 999, 99, 49):
        try:
            normalized_string = normalize('NFKC', gClean[:limit].lower())
        except TypeError:
            continue
        break
    else:
        normalized_string = '# Fatally bad File\n'
    return normalized_string.split()
开发者ID:iarroyof,项目名称:nlp-pipeline,代码行数:34,代码来源:combiner.py


示例9: get_unicode_str

def get_unicode_str(size=10, max_char=0xFFFF, onlyNormalized=False, includeUnexisting=False):
    '''
    Build a random Unicode string of at least ``size`` characters that
    avoids characters excluded for the current OS.

    Characters are drawn uniformly from code points 0..``max_char``.
    Control characters 0-31 are always excluded; on Windows all of
    string.punctuation is excluded too, elsewhere only "/" and ".".
    Unless ``includeUnexisting`` is true, characters without a Unicode
    name are skipped.  With ``onlyNormalized`` each character is appended
    in its NFC form.

    Notice: if includeUnexisting==True, it is possible that files don't
    get synchronized
    '''
    control_chars = u''.join([unichr(code) for code in range(0, 32)])
    if platform.system() == "Windows":
        # Control characters 0 through 31 plus punctuation such as quote,
        # less than, greater than and pipe.
        exclude = string.punctuation + u"\t" + control_chars
    else:
        # Probably depends mainly on the file system type.
        exclude = u"/" + u"." + control_chars

    generated = u""
    while len(generated) < size:
        candidate = unichr(random.randint(0, max_char))
        if candidate in exclude:
            continue
        try:
            if not includeUnexisting:
                # Raises ValueError for a character that has no name.
                unicodedata.name(candidate)
            if onlyNormalized:
                candidate = unicodedata.normalize('NFC', candidate)
        except ValueError:
            continue
        generated = generated + candidate
    return generated
开发者ID:Anonymodmous,项目名称:sdk,代码行数:28,代码来源:sync_test_base.py


示例10: format

 def format(self, stream, args):
     """Write the next argument, a single character, to ``stream``.

     With ``self.atsign`` set, the character is written as a string
     literal: a backslash escape if it is in ``python_escapes``, else a
     ``\\N{...}`` escape using its Unicode name, else its repr().
     Otherwise a character outside the "C" categories is written as is,
     and one inside them is written as its Unicode name or, failing
     that, as its entry in ``ascii_control_chars`` (index 1 when
     ``self.colon`` is set, else 0).

     Raises:
         TypeError: the argument is not exactly one character long.
         FormatError: a nameless "C"-category character is not in
             ``ascii_control_chars``.
     """
     char = unicode(args.next())
     if len(char) != 1:
         raise TypeError("expected single character")

     if self.atsign:
         if char in python_escapes:
             stream.write('"\\%s"' % python_escapes[char])
             return
         try:
             stream.write('u"\\N{%s}"' % unicodedata.name(char))
         except ValueError:
             stream.write(repr(char))
         return

     if not unicodedata.category(char).startswith("C"):
         stream.write(char)
         return

     try:
         stream.write(unicodedata.name(char))
     except ValueError:
         code = ord(char)
         if code not in ascii_control_chars:
             raise FormatError("unprintable character")
         which = 1 if self.colon else 0
         stream.write(ascii_control_chars[code][which])
开发者ID:plotnick,项目名称:prettyprinter,代码行数:25,代码来源:format.py


示例11: showdict

def showdict(data, indent):
    """Print the nested dict ``data`` as a parenthesised tree, keys sorted.

    Each key is printed as ``(key``, followed by its value and ``)``.  A
    dict value is printed recursively; any other value is printed UTF-8
    encoded, followed by its Unicode name when "-n" is in sys.argv.

    ``indent`` is the number of leading spaces for this level.  A
    negative value means "print no indentation here", while its absolute
    value is still passed down so deeper levels can indent correctly.
    """
    first=True
    for key in sorted(data.keys()):
        value=data[key]
        if first:
            first=False
        else:
            # Start a new line before every key except the first.
            print
        # The trailing comma keeps the output on the same line.
        print " "*max(indent,0) + "("+key,
        # Sneaky trick: we don't want to go newline-indent over and
        # over for long sequences, i.e. cases where there is only
        # one possible follower.  So we skip the newlines in those
        # cases, and tell the next-lower iteration not to do the whole
        # indent thing by passing a negative indent.  We don't just
        # pass 0 or 1 because if another iteration *further down*
        # turns out not to be an only case, it will need to know
        # the right indent to pass along.  So a case like
        # R-O-{CK|LL}, the O is unique after the R, so no linefeed,
        # but then the {C|L} are not unique after the O.
        if type(value)==dict:
            if len(value)>1:
                print ""
                showdict(value, abs(indent)+4),
            else:
                showdict(value, -(abs(indent)+4)),
        else:
            print "    "+value.encode('utf-8'),
            if "-n" in sys.argv:
                # Best effort: any failure to get a name is ignored, so
                # the value is simply printed without one.
                try:
                    print unicodedata.name(value),
                except:
                    pass
        print ")",
开发者ID:dcherian,项目名称:xcompose,代码行数:33,代码来源:treeprint.py


示例12: report_code_points

def report_code_points(char_class, code_point_list, text=''):
    """Print one line per entry of ``code_point_list``, in sorted order.

    An entry that is a plain int is printed as the character, its hex
    code point and its Unicode name.  Any other entry is treated as a
    pair and printed as "from → to" for the character, the code point and
    the name.  Characters without a name are shown as 'name unknown'.
    Every line is prefixed with ``char_class`` and ``text``.
    """
    def describe(cp):
        # Character, hex code point and Unicode name for one code point.
        return chr(cp), hex(cp), unicodedata.name(chr(cp), 'name unknown')

    for code_point in sorted(code_point_list):
        if type(code_point) is int:
            char, code_hex, name = describe(code_point)
            fields = {'text': text,
                      'char': char,
                      'char_class': char_class,
                      'code_point': code_hex,
                      'name': name}
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  % fields)
            continue

        char0, code_hex0, name0 = describe(code_point[0])
        char1, code_hex1, name1 = describe(code_point[1])
        fields = {
            'text': text,
            'char_class': char_class,
            'char0': char0,
            'code_point0': code_hex0,
            'name0': name0,
            'char1': char1,
            'code_point1': code_hex1,
            'name1': name1,
        }
        print(('%(char_class)s: %(text)s: '
               + '%(char0)s → %(char1)s '
               + '%(code_point0)s → %(code_point1)s '
               + '%(name0)s → %(name1)s') % fields)
开发者ID:bminor,项目名称:glibc,代码行数:26,代码来源:ctype_compatibility.py


示例13: test_cjk

 def test_cjk(self):
     """Check name() and lookup() at the edges of the CJK ideograph ranges.

     Skipped unless sys.maxunicode is at least 0x10ffff.  Code points at
     and just inside each boundary must have the algorithmic name
     ``CJK UNIFIED IDEOGRAPH-XXXX`` and round-trip through lookup().
     For code points just outside, lookup() must raise KeyError; a
     ValueError from name() is tolerated.
     """
     import sys
     if sys.maxunicode < 0x10ffff:
         skip("requires a 'wide' python build.")
     import unicodedata
     # Compare the database version numerically: as plain strings,
     # "10.0.0" would sort below "4.1".
     version = tuple(int(part)
                     for part in unicodedata.unidata_version.split('.'))
     cases = ((0x3400, 0x4DB5),
              (0x4E00, 0x9FA5))
     if version >= (4, 1):
         cases = ((0x3400, 0x4DB5),
                  (0x4E00, 0x9FBB),
                  (0x20000, 0x2A6D6))
     for first, last in cases:
         # Test at and inside the boundary
         for i in (first, first + 1, last - 1, last):
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             assert unicodedata.name(unichr(i)) == charname
             assert unicodedata.lookup(charname) == unichr(i)
         # Test outside the boundary
         for i in first - 1, last + 1:
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             try:
                 unicodedata.name(unichr(i))
             except ValueError:
                 pass
             # `raises` and `skip` are not defined in this block;
             # presumably helpers supplied by the test framework.
             raises(KeyError, unicodedata.lookup, charname)
开发者ID:AishwaryaKM,项目名称:python-tutorial,代码行数:25,代码来源:test_unicodedata.py


示例14: safe_path

def safe_path(origtitle):
    """Return a three-part relative path ending in ``.json`` for ``origtitle``.

    The title is cleaned with ``ftfy`` and ``safe_path_component``; an
    empty result makes both the title and ``origtitle`` ``_``, and a
    title starting with ``-`` or ``.`` gets a ``_`` prefix.

    A one-character ``origtitle`` maps to
    ``single_character/<category>/<charname>.json``, where the category
    is the part of the character's sanitized Unicode name before the
    first underscore.  Longer titles map to
    ``<charname>/<charname2>/<punycode>.json``, using the names of the
    first two characters and the punycode form of the first 64
    characters of the title after NFKD normalization.  A character
    without a Unicode name is labelled ``UNKNOWN``.
    """
    def char_label(ch):
        # Sanitized Unicode name of ch, or a placeholder when lookup fails.
        try:
            return safe_path_component(unicodedata.name(ch))
        except ValueError:
            return u'UNKNOWN'

    title = safe_path_component(ftfy(origtitle))

    if len(title) == 0:
        title = origtitle = u'_'

    if title.startswith((u'-', u'.')):
        title = u'_' + title

    charname = char_label(origtitle[0])
    category = charname.split('_')[0]

    # The layout gives every article a unique name that can be stored on
    # multiple file systems and tab-completed.
    if len(origtitle) == 1:
        return u'/'.join([u'single_character', category, charname + '.json'])

    charname2 = char_label(origtitle[1])
    normalized = unicodedata.normalize("NFKD", safe_path_component(title[:64]))
    finalpart = normalized.encode('punycode').rstrip('-')
    return u'/'.join([charname, charname2, finalpart + '.json'])
开发者ID:bblack,项目名称:conceptnet5,代码行数:28,代码来源:extract_wiktionary.py


示例15: main

def main():
    """Build a graph linking vocabulary words that contain similar kanji.

    Words are loaded from the ``db/Minna_no_nihongo_*`` text files and
    added as graph nodes carrying their furigana, meaning and chapter.
    For every pair of words, each pair of CJK unified characters (one
    from each word) is compared by the similarity ratio of their
    ``fingerprint`` entries; a ratio above 0.8 adds a weighted edge
    between the two words.  All ratios are accumulated into a histogram,
    which is plotted, and the largest connected component is written to
    ``kanjis.graphml``.
    """

    # get files: numbers 1-25 use the "1." file prefix, 26-28 the "2." one
    files = []
    for i in range(1,29):
        if i < 26:
            files.append("db/Minna_no_nihongo_1.%02d.txt" % i)
        else: 
            files.append("db/Minna_no_nihongo_2.%02d.txt" % i)

    # get words from files
    words = get_words_from_files(files)


    # add words to network
    G=nx.Graph()
    for w in words:
        G.add_node(w)
        G.node[w]['furigana'] = words[w]['furigana']
        G.node[w]['meaning'] = words[w]['meaning']
        G.node[w]['chapter'] = words[w]['chapter']

    # to make statistics: an all-zero-input histogram fixes the bin layout
    # that the per-pair histograms below are added onto
    nbins, dmin, dmax = 20, 0, 1
    hist, edges = np.histogram([0], bins=nbins, range=(dmin, dmax))

    # adding edges
    words = G.nodes()
    print("Total number of words: ",len(words))

    for word1, word2 in itertools.combinations(words,2):
        for w1 in word1:
            for w2 in word2:
                # NOTE(review): ud.name() is called without a default, so
                # a character with no Unicode name raises ValueError here.
                if "CJK UNIFIED" in ud.name(w1) and "CJK UNIFIED" in ud.name(w2):
                    f1, f2 = fingerprint[w1], fingerprint[w2]
                    match = SequenceMatcher(None, f1, f2 , autojunk=True)
                    ratio = match.ratio()

                    # add data to histogram
                    new_hist, edges = np.histogram(ratio, bins=nbins, range=(dmin, dmax))
                    hist += new_hist
                    
                    if ratio > 0.8:                      
                        # G.add_edge(word1, word2, weight=5*ratio-4) # 0.8 - 1 --> 0 - 1
                        G.add_edge(word1, word2, weight=4*ratio-3.2) # 0.8 - 1 --> 0 - 0.8
                        # Leaves only the innermost (w2) loop; the next
                        # w1 of the same word pair is still compared.
                        break

    # plot data: histogram counts against the bin centres
    score = 0.5*(edges[1:] + edges[:-1])
    plt.plot(score, hist)
    plt.xlabel("score")
    plt.ylabel("histogram")
    plt.show()


    # Connected components, largest first; G is rebound to that list.
    G = sorted(nx.connected_component_subgraphs(G), key = len, reverse=True)

    print("Total number of words connected: ", len(G[0].nodes()))
    nx.write_graphml(G[0], "kanjis.graphml", encoding='utf-8', prettyprint=True)
开发者ID:yescalona,项目名称:KanjiNetwork,代码行数:59,代码来源:kanjis2.py


示例16: is_in_alphabet

 def is_in_alphabet(self, uchr, alphabet):
     """Tell whether ``uchr`` counts as belonging to ``alphabet``.

     The test is whether ``alphabet`` occurs in the character's Unicode
     name.  With ``self.no_memory`` set, non-alphabetic characters always
     count as belonging and nothing is cached.  Otherwise the answer is
     read from ``self.alphabet_letters[alphabet]`` and, on a miss,
     computed from the name alone and stored there.
     """
     if not self.no_memory:
         try:
             return self.alphabet_letters[alphabet][uchr]
         except KeyError:
             known = self.alphabet_letters[alphabet]
             return known.setdefault(uchr, alphabet in ud.name(uchr))
     return not uchr.isalpha() or alphabet in ud.name(uchr)
开发者ID:iAdramelk,项目名称:dungeon_bot,代码行数:8,代码来源:alphabet_detector.py


示例17: is_unicode

    def is_unicode(self, char):
        """Return True when ``char`` has a name in the Unicode database.

        See http://docs.python.org/2/library/unicodedata.html
        """
        try:
            unicodedata.name(unicode(char))
        except ValueError:
            return False
        return True
开发者ID:guyjacks,项目名称:jeap,代码行数:9,代码来源:interpreter.py


示例18: endElement

	def endElement(self,name):
		"""Handle a closing tag; on ``ar`` store the collected article as an entry.

		The whitespace-normalized article text is expected to contain a
		bracketed romanization followed by the translation.  The entry is
		stored in ``self.dict`` under its traditional form unless that key
		already exists, character counts are updated in ``self.dist``, and
		a progress message is emitted whenever the number of stored words
		is a multiple of 100.  Any exception while building the entry is
		swallowed (printed only when DEBUG is set).  ``self.article`` is
		reset afterwards.
		"""
		if name=='ar':
			self.in_arContent=False
			
			#Now store the entry:
			try:
				
				ch=self.normalize_whitespace(self.article)
				# Nothing collected; note that self.article is not reset
				# on this early return.
				if not len(ch):return
				
				# Romanization is the text between the first '[' and the
				# first ']'; everything after ']' is the translation.
				lidx=ch.index('[')
				ridx=ch.index(']')
				self.romanization=ch[lidx+1:ridx]
				split_romanization=string.split(self.romanization,u' ',199)
				self.translation=ch[ridx+1:]
				
				# Unicode names of each character of both spellings.
				# NOTE(review): indexes splitkey[1] with positions taken
				# from splitkey[0], so this assumes both have the same
				# length -- confirm.
				cjktraditional=[]
				cjksimplified=[]
				for cidx in range(len(self.splitkey[0])):
					cjktraditional.append(unicodedata.name(self.splitkey[0][cidx]))
					cjksimplified.append(unicodedata.name(self.splitkey[1][cidx]))
					
				#print self.romanization,self.translation
				entry={
					'traditional':self.splitkey[0],		#uchar string
					'simplified':self.splitkey[1],		#uchar string
					'cjktraditional':cjktraditional,
					'cjksimplified':cjksimplified,
					'romanization':split_romanization,	#list of morphemes
					'frequencies':[],					#filled by post-process with romanized morpheme frequencies
					'translation':self.translation,
				}

				# The first entry seen for a traditional form wins.
				if self.dict.has_key(entry['traditional']):
					#print 'already have: ',`entry['traditional']`,entry['romanization']#fontset more likely to have traditional, if any
					#print 'proof:',self.dict[entry['traditional']]['romanization']
					pass
				else:self.dict[entry['traditional']]=entry
				
				
				#Add to distro: count each character of the traditional form
				for item in entry['traditional']:
					try:self.dist[item]+=1
					except:self.dist[item]=1
				
				
				# Report progress every 100 stored words.
				if math.fmod(len(self.dict.keys()),100)==0:
					msglist=[
						"Words  :%6d"%(len(self.dict.keys())),
						"Symbols:%6d"%(len(self.dist.keys()))
					]
					self.progress_message(msglist)
				
			except Exception,e:
				# A malformed article (e.g. missing brackets or an unnamed
				# character) is skipped rather than aborting the parse.
				if DEBUG:print e
				
			self.article=u''
开发者ID:anaselli,项目名称:stepintochinese,代码行数:57,代码来源:cp.py


示例19: codePointToCharacter

def codePointToCharacter(cp):
    """Return the character for code point ``cp``.

    Returns None when ``cp`` is above ``maxSupportedCodePoint()``.

    Raises:
        ValueError: ``cp`` is above 0xFF and the character has no name
            in the Unicode database.
    """
    if cp > maxSupportedCodePoint():
        return None
    char = unichr(cp)
    # unicodedata has no names for control characters, so the name check
    # is skipped for code points up to 0xFF.
    if cp > 0xFF:
        # Called only for its exception; it propagates to the caller
        # unchanged, with its traceback intact.
        unicodedata.name(char)
    return char
开发者ID:johnarmstrong,项目名称:koreanlangutils,代码行数:9,代码来源:charutils.py


示例20: symbolEq

def symbolEq(s1,s2):
    """Return True when two UTF-8 encoded characters have the same Unicode name.

    Each argument is a byte string holding one UTF-8 encoded character.
    The result is False when the byte lengths differ, when an argument is
    empty, when an argument does not decode to exactly one character, or
    when a character has no Unicode name.
    """
    if len(s1) != len(s2):
        return False
    # Empty input has no character to compare.  The original test was
    # inverted and rejected every non-empty input instead.
    if len(s1) == 0:
        return False
    try:
        x1 = unicodedata.name(u'%s' % s1.decode('utf-8'))
        x2 = unicodedata.name(u'%s' % s2.decode('utf-8'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: argument has no decode(); TypeError: more than
        # one character; ValueError: bad UTF-8 or a character with no name.
        return False
    return x1 == x2
开发者ID:GordonEstes,项目名称:AutoFormatter,代码行数:9,代码来源:regexlib.py



注:本文中的unicodedata.name函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python unicodedata.normalize函数代码示例发布时间:2022-05-27
下一篇:
Python unicodedata.mirrored函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap