本文整理汇总了Python中unicodedata.decomposition函数的典型用法代码示例。如果您正苦于以下问题:Python decomposition函数的具体用法?Python decomposition怎么用?Python decomposition使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了decomposition函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: uni2tex
def uni2tex(text):
    """Translate accented Unicode characters in *text* into TeX macros.

    Relies on the module-level ``accents`` table mapping combining-mark
    codepoints to TeX accent command letters.  (Python-2 era code: uses
    ``unichr`` on the precomposed-character path.)
    """
    parts = []
    chars = tuple(text)
    pos = 0
    while pos < len(chars):
        current = text[pos]
        point = ord(current)
        # Elsevier bibtex dumps sometimes have a fancy dash (en dash)
        if point == 8211:
            parts.append("-")
        # combining marks: attach the accent macro to the following char
        elif unicodedata.category(current) in ("Mn", "Mc") and point in accents:
            parts.append("{\\%s%s}" % (accents[point], chars[pos + 1]))
            pos += 1
        # precomposed characters: decomposition is "<base hex> <accent hex>"
        elif unicodedata.decomposition(current):
            base, acc = unicodedata.decomposition(current).split()
            acc = int(acc, 16)
            base = int(base, 16)
            if acc in accents:
                parts.append("\\%s{%s}" % (accents[acc], unichr(base)))
            else:
                parts.append(current)
        else:
            parts.append(current)
        pos += 1
    return "".join(parts)
开发者ID:SCOREC,项目名称:scorec-refs,代码行数:30,代码来源:toascii.py
示例2: uni2tex
def uni2tex(text):
    """Translate accented Unicode characters in *text* into TeX accent macros.

    Precomposed characters (e.g. "é") are split into base + combining accent
    and rendered as e.g. ``\\'{e}``; bare combining marks are wrapped around
    the following character.  Characters without a recognized accent pass
    through unchanged.
    """
    # Courtesy of https://tex.stackexchange.com/q/23410
    accents = {
        0x0300: '`', 0x0301: "'", 0x0302: '^', 0x0308: '"',
        0x030B: 'H', 0x0303: '~', 0x0327: 'c', 0x0328: 'k',
        0x0304: '=', 0x0331: 'b', 0x0307: '.', 0x0323: 'd',
        0x030A: 'r', 0x0306: 'u', 0x030C: 'v',
    }
    out = ""
    txt = tuple(text)
    i = 0
    while i < len(txt):
        char = text[i]
        code = ord(char)
        # combining marks -- applied to the following character
        # (NOTE(review): NFD puts marks *after* their base; this snippet
        # historically attaches them forward, behavior kept as-is)
        if unicodedata.category(char) in ("Mn", "Mc") and code in accents:
            # BUG FIX: a combining mark as the last character used to raise
            # IndexError on txt[i+1]; emit a standalone accent macro instead.
            follower = txt[i + 1] if i + 1 < len(txt) else ''
            out += "\\%s{%s}" % (accents[code], follower)
            i += 1
        # precomposed characters
        elif unicodedata.decomposition(char):
            decomp = unicodedata.decomposition(char).split()
            # BUG FIX: only plain two-field canonical decompositions can be
            # unpacked into (base, accent).  Compatibility decompositions
            # ("<compat> ...") and single-field ones used to raise
            # ValueError in the tuple unpacking -- pass those through.
            if len(decomp) == 2 and not decomp[0].startswith('<'):
                base = int(decomp[0], 16)
                acc = int(decomp[1], 16)
                if acc in accents:
                    out += "\\%s{%s}" % (accents[acc], chr(base))
                else:
                    out += char
            else:
                out += char
        else:
            out += char
        i += 1
    return out
开发者ID:TianpeiLuke,项目名称:labella.py,代码行数:32,代码来源:tex.py
示例3: uni2tex
def uni2tex(text):
    """
    Translate accented unicode characters into latex macros.
    http://tex.stackexchange.com/questions/23410/how-to-convert-characters-to-latex-code

    Relies on the module-level ``accents`` (combining-mark codepoint -> TeX
    accent letter) and ``specials`` (char -> literal replacement) tables.
    """
    pieces = []
    chars = tuple(text)
    pos = 0
    while pos < len(chars):
        current = text[pos]
        point = ord(current)
        # combining marks: wrap the *next* character in the accent macro
        if unicodedata.category(current) in ("Mn", "Mc") and point in accents:
            pieces.append("{\\%s{%s}}" % (accents[point], chars[pos + 1]))
            pos += 1
        # precomposed characters: "<base hex> <accent hex>"
        elif unicodedata.decomposition(current):
            base, acc = unicodedata.decomposition(current).split()
            acc = int(acc, 16)
            base = int(base, 16)
            if acc in accents:
                pieces.append("{\\%s{%s}}" % (accents[acc], chr(base)))
            else:
                pieces.append(current)
        # other special case
        elif current in specials:
            pieces.append("{%s}" % specials[current])
        else:
            pieces.append(current)
        pos += 1
    return "".join(pieces)
开发者ID:jdumas,项目名称:autobib,代码行数:35,代码来源:latex.py
示例4: getDecompositionData
def getDecompositionData(u,missingMarks):
    """Resolve the canonical decomposition of codepoint *u* against the font.

    Returns ``(glyph, [component glyphs], first component codepoint)`` when
    *u* has a multi-part canonical decomposition whose every component is
    present in the module-level ``umap``; returns 0 in every other case.
    Side effect: components that fall in ``MARK_GLYPH_CODEPOINT_RANGE`` but
    are missing from ``umap`` are appended (as hex strings) to *missingMarks*.

    NOTE(review): relies on module-level ``umap``, ``SKIP_MARKS_FINAL``,
    ``MARK_GLYPH_CODEPOINT_RANGE``, ``unicodeIntToHexstr`` and the
    Python-2-only ``unichr``.
    """
    # inside so we can use umap, nmap ...
    udec = None
    try:
        dec = unicodedata.decomposition(unichr(u))
        if len(dec) > 1:
            # a leading "<" marks a compatibility decomposition ("<compat>"
            # etc.); only canonical decompositions are considered here
            if not dec[:1] == "<":
                udec = [int(s, 16) for s in dec.split()]
                decall = 0
                for ud in udec:
                    if ud in SKIP_MARKS_FINAL: # if mark is in SKIP_MARKS_FINAL we don't want to do any decomposition
                        return 0
                    if ud in umap:
                        decall += 1
                    else:
                        if ud not in SKIP_MARKS_FINAL \
                                and ud in MARK_GLYPH_CODEPOINT_RANGE:
                            missingMarks += [unicodeIntToHexstr(ud)]
                # if decall == len(udec) and decall == 1:
                # print "SAME:",umap[u],[umap[ud] for ud in udec]
                if decall == len(udec) and decall > 1: # the last condition may go for the sake of allowing reference to same-shape glyphs
                    return umap[u],[umap[ud] for ud in udec],udec[0] # last one is the one to check next
    except ValueError:
        # presumably guards unichr(u) on out-of-range codepoints
        # (narrow Python 2 builds) -- TODO confirm
        return 0
    return 0
开发者ID:davelab6,项目名称:ttfdiet,代码行数:25,代码来源:ttfdiet.py
示例5: remove_accents
def remove_accents(chars):
    """Divides a given string into decomposable and undecomposable characters.

    Returns ``(decomposable, undecomposable)`` -- two lists of
    ``(original char, replacement string)`` pairs.

    NOTE(review): Python-2-only code.  It relies on ``unichr`` and on
    ``map``/``filter`` evaluating eagerly (returning lists) inside the
    ``try`` block; ``CHAR_REPLACEMENT`` is a module-level table keyed by
    codepoint.
    """
    decomposable = []
    undecomposable = []
    for c in chars:
        de = unicodedata.decomposition(c)
        if de:
            dechars = de.split(None)
            try:
                # Only keep characters with a decimal value < 300
                dechars = map(lambda i: int(i, 16), dechars)
                dechars = filter(lambda i: i < 300, dechars)
                dechars = map(unichr, dechars)
                de = "".join(dechars)
            except (IndexError, ValueError):
                # int() failed: the decomposition starts with a "<tag>"
                # field (compatibility decomposition)
                if ord(c) in CHAR_REPLACEMENT:
                    de = CHAR_REPLACEMENT[ord(c)]
                else:
                    # drop the "<tag>" fields, convert the remaining hex ones
                    dechars = filter(lambda s: s[0] != "<", dechars)
                    dechars = map(lambda i: int(i, 16), dechars)
                    dechars = map(unichr, dechars)
                    de = "".join(dechars)
                undecomposable.append((c, de))
            else:
                # clean canonical decomposition succeeded
                decomposable.append((c, de))
        else:
            # no decomposition at all: only record it if an explicit
            # replacement is known
            if ord(c) in CHAR_REPLACEMENT:
                de = CHAR_REPLACEMENT[ord(c)]
                undecomposable.append((c, de))
    return decomposable, undecomposable
开发者ID:Wet-Host,项目名称:Basic-Theme,代码行数:30,代码来源:generate_remove_accents_tests.py
示例6: string2filename
def string2filename(s):
    """Convert a string to a valid filename.

    Lowercases *s*, strips any leading path, transliterates characters via
    the module-level ``mapping`` table or NFKD decomposition, then keeps
    only filesystem-safe characters and replaces spaces with dashes.
    """
    s = s.strip()
    s = s.lower()
    # remove an eventual path
    s = s.replace("\\","/")
    _, s = os.path.split(s)
    res = u''
    for c in s:
        o = ord(c)
        # FIX: the unused `mkeys = mapping.keys()` local is gone, and the
        # membership test is `o in mapping` instead of rebuilding
        # mapping.keys() on every character.
        if o in mapping:
            # explicit transliteration table wins
            res = res+mapping[o]
            continue
        if decomposition(c):
            # decomposable char: NFKD-normalize so the combining accent is
            # dropped by the valid_chars filter below
            res = res + normalize('NFKD', c)
        else:
            res = res + c
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    res = ''.join(c for c in res if c in valid_chars)
    res = res.replace(" ","-")
    return res
开发者ID:mrtopf,项目名称:jmstvcamp,代码行数:27,代码来源:utils.py
示例7: getdetails
def getdetails(self, text):
    """Return a dict of Unicode properties for every character in *text*.

    Each character maps to a sub-dict with its name, HTML entity number,
    code point repr, numeric/decimal values (omitted when undefined),
    category flags and canonical decomposition.  The extra key
    'Characters' holds the characters of *text* as a list.

    Raises ValueError if a character has no Unicode name.
    """
    chardetails = {}
    for character in text:
        chardetails[character] = {}
        chardetails[character]['Name'] = unicodedata.name(character)
        chardetails[character]['HTML Entity'] = str(ord(character))
        chardetails[character]['Code point'] = repr(character)
        # numeric/decimal/digit raise ValueError for characters without
        # such a value; those keys are simply omitted then (the old bare
        # "except:" clauses are narrowed to ValueError).
        try:
            chardetails[character]['Numeric Value'] = \
                unicodedata.numeric(character)
        except ValueError:
            pass
        try:
            chardetails[character]['Decimal Value'] = \
                unicodedata.decimal(character)
        except ValueError:
            pass
        try:
            # BUG FIX: was unicodedata.digit(mychar) -- `mychar` is an
            # undefined name that the old bare except silently swallowed.
            # NOTE(review): this value is still overwritten by the
            # str(character.isdigit()) assignment below, matching the
            # original's effective behavior.
            chardetails[character]['Digit'] = unicodedata.digit(character)
        except ValueError:
            pass
        chardetails[character]['Alphabet'] = str(character.isalpha())
        chardetails[character]['Digit'] = str(character.isdigit())
        chardetails[character]['AlphaNumeric'] = str(character.isalnum())
        chardetails[character]['Canonical Decomposition'] = \
            unicodedata.decomposition(character)
    chardetails['Characters'] = list(text)
    return chardetails
开发者ID:copyninja,项目名称:chardetails,代码行数:29,代码来源:core.py
示例8: buildCompatChars
def buildCompatChars(sfd, ttf):
    """Fill the Arabic presentation-forms blocks of the FontForge font *sfd*
    with composite glyphs, by shaping each codepoint's compatibility
    decomposition with HarfBuzz against the compiled font file *ttf*.

    NOTE(review): depends on module-level HarfBuzz/GLib introspection
    bindings, fontTools' TTFont, FontForge's psMat, a shape() helper,
    ``ucd`` (unicodedata) and the Python-2-only ``unichr``.
    """
    zwj = u'\u200D'  # ZERO WIDTH JOINER
    # Arabic presentation forms A and B codepoint ranges (inclusive)
    ranges = (
        (0xfb50, 0xfbb1),
        (0xfbd3, 0xfd3d),
        (0xfd50, 0xfdf9),
        (0xfdfc, 0xfdfc),
        (0xfe70, 0xfefc),
    )
    # build a HarfBuzz font from the raw TTF bytes
    with open(ttf, "rb") as f:
        data = f.read()
    blob = HarfBuzz.glib_blob_create(GLib.Bytes.new(data))
    face = HarfBuzz.face_create(blob, 0)
    hbfont = HarfBuzz.font_create(face)
    upem = HarfBuzz.face_get_upem(face)
    HarfBuzz.font_set_scale(hbfont, upem, upem)
    HarfBuzz.ot_font_set_funcs(hbfont)
    ttfont = TTFont(ttf)
    for r in ranges:
        for c in range(r[0], r[1]+1):
            # decomposition is a "<tag>" keyword followed by hex codepoints
            dec = ucd.decomposition(unichr(c)).split()
            if dec:
                keyword = dec[0]
                text = u''
                for i in dec[1:]:
                    text += unichr(int(str(i),16))
                # surround with ZWJ so shaping yields the right positional
                # (initial/medial/final) form
                if keyword == '<initial>':
                    text = text + zwj
                elif keyword == '<final>':
                    text = zwj + text
                elif keyword == '<medial>':
                    text = zwj + text + zwj
                components = shape(text, hbfont)
                if components:
                    # rebuild the presentation form as references to the
                    # shaped component glyphs
                    glyph = sfd.createChar(c)
                    glyph.clear()
                    glyph.color = 0xff0000 # red color
                    x = 0
                    for component in components:
                        gid = component[0]
                        name = ttfont.getGlyphName(gid)
                        x_advance = component[1]
                        x_offset = component[2]
                        y_offset = component[3]
                        matrix = psMat.translate(x + x_offset, y_offset)
                        # ignore blank glyphs, e.g. space or ZWJ
                        if sfd[name].foreground or sfd[name].references:
                            glyph.addReference(name, matrix)
                        x += x_advance
                    glyph.width = x
开发者ID:Ashraf-Ali-aa,项目名称:amiri-font,代码行数:60,代码来源:build_compat.py
示例9: asciify
def asciify(string):
    '''
    "ASCIIfy" a Unicode string by stripping all umlauts, tildes, etc.
    This very cool function originates at
    http://www.physic.ut.ee/~kkannike/english/prog/python/index.html
    '''
    # Unfortunately, I don't really understand, how this function works.
    # I have a hunch, this could be done better with a decomposed representation
    # of the string ("NFKD"), but I don't have time to really test a function
    # as sensitive as this one right now.
    # To work reliably the way it is, strings must consist of composed
    # characters.
    string = normalize("NFC", string)
    temp = u''
    for char in string:
        decomp = decomposition(char)
        if decomp: # Not an empty string
            d = decomp.split()[0]
            try:
                # BUG FIX: chr() replaces the Python-2-only unichr(), which
                # raised NameError on Python 3 for any decomposable char.
                temp += chr(int(d, 16))
            except ValueError:
                # first field is a "<tag>"; only "<super>" is handled
                if d == "<super>":
                    temp += chr(int(decomp.split()[1], 16))
                else:
                    pass
                    #raise Exception("Can't handle this: " + repr(decomp))
        else:
            temp += char
    return temp
开发者ID:dvorberg,项目名称:t4,代码行数:32,代码来源:title_to_id.py
示例10: extended_unicode_model
def extended_unicode_model(list):
    """
    Takes as input a list of QLC-formatted words and outputs a unigram model.

    Prints one TSV row per distinct segment (most frequent first) with its
    codepoint, Unicode name, category, combining class, decomposition,
    count and relative frequency.
    """
    # tally every whitespace-separated segment across all words
    tallies = collections.defaultdict(int)
    total_segments = 0
    for word in list:
        for seg in word.strip().split():
            total_segments += 1
            tallies[seg] += 1
    # most frequent segments first; ties keep first-seen order
    ranked = sorted(tallies.items(), key=operator.itemgetter(1), reverse=True)
    print("Char\tint\tUnicode name\tcategory\tcomb class\tdecomposition\tcount\tfrequency")
    for seg, cnt in ranked:
        freq = tallies[seg]/total_segments
        row = (
            seg,
            str(ord(seg)),
            unicodedata.name(seg),
            unicodedata.category(seg),
            str(unicodedata.combining(seg)),
            unicodedata.decomposition(seg),
            str(cnt),
            str(freq),
        )
        print("\t".join(row))
开发者ID:bambooforest,项目名称:matrix,代码行数:30,代码来源:ngram.py
示例11: mapchar
def mapchar(self, key):
    """Map codepoint *key* to a transliterated value, caching in *self*.

    Returns the decomposition base codepoint (or a CHAR_REPLACEMENT entry),
    filtered so that only "safe" ranges survive; rejected characters map to
    None.  *self* is a dict-like cache keyed by codepoint.

    NOTE(review): Python-2-only (unichr); CHAR_REPLACEMENT here is keyed by
    *character*, and its values may not be ints -- TODO confirm the
    numeric comparisons below are always valid.
    """
    if key in self:
        # already computed and cached
        return self[key]
    de = unicodedata.decomposition(unichr(key))
    if de:
        try:
            # first field of the decomposition is the base codepoint
            ch = int(de.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # e.g. a "<tag>" field -- keep the original codepoint
            ch = key
    else:
        ch = CHAR_REPLACEMENT.get(unichr(key), key)
    if ch == 32: # space
        pass
    elif 47 < ch < 58: # digits
        pass
    elif 64 < ch < 91: # uppercase
        pass
    elif 96 < ch < 123: # lowercase
        pass
    elif 127 < ch < 165: # upper ascii latin1
        pass
    elif ch == 9: # map tab to space
        ch = 32
    elif ch < 128: # reject invalid lower ascii
        ch = None
    elif ch in (152, 158) or ch < 256:
        # reject remaining latin-1 range codepoints (and 152/158 explicitly)
        ch = None
    self[key] = ch
    return ch
开发者ID:ludoo,项目名称:django-autoslug,代码行数:29,代码来源:transliterate.py
示例12: isvalidaccelerator
def isvalidaccelerator(accelerator, acceptlist=None):
    """returns whether the given accelerator character is valid

    @type accelerator: character
    @param accelerator: A character to be checked for accelerator validity
    @type acceptlist: String
    @param acceptlist: A list of characters that are permissible as accelerators
    @rtype: Boolean
    @return: True if the supplied character is an acceptable accelerator
    """
    assert isinstance(accelerator, unicode)
    assert isinstance(acceptlist, unicode) or acceptlist is None
    if not accelerator:
        return False
    if acceptlist is not None:
        # membership in the normalized accept list decides directly
        return accelerator in data.normalize(acceptlist)
    # Old code path - ensures that we don't get a large number of regressions
    stripped = accelerator.replace("_", "")
    if stripped in u"-?":
        return True
    if not stripped.isalnum():
        return False
    # We don't want accelerators on characters with diacritics, so reject
    # anything whose decomposition (minus "<...>" tags) has multiple fields.
    decomposed = re.sub("<[^>]+>", "", unicodedata.decomposition(stripped)).strip()
    return decomposed.count(" ") == 0
开发者ID:AndryulE,项目名称:kitsune,代码行数:33,代码来源:decoration.py
示例13: deaccent_char
def deaccent_char(c):
    """Return the base character of *c* with any accent removed.

    Uses the first field of the canonical decomposition; characters with no
    decomposition, or with only a compatibility decomposition, are returned
    unchanged.
    """
    decomposed = unicodedata.decomposition(c)
    if decomposed:
        first = decomposed.split(' ')[0]
        # BUG FIX: compatibility decompositions start with a "<tag>" field
        # (e.g. "<compat> 0066 0069" for U+FB01), which used to make
        # int(..., 16) raise ValueError.  Treat them as non-decomposable.
        if first.startswith('<'):
            return c
        return chr(int(first, 16))
    else:
        return c
开发者ID:hsoft,项目名称:dtf,代码行数:7,代码来源:models.py
示例14: normalizeUnicode
def normalizeUnicode(text, encoding='humanascii'):
    """
    This method is used for normalization of unicode characters to the base ASCII
    letters. Output is ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

    NOTE(review): Python-2-only -- uses unicode(), dict.has_key() and
    filter() returning a string.  Relies on the module-level ``allowed``,
    ``allowedid`` and ``mapping`` tables.
    """
    if text == "":
        return ""
    unicodeinput = True
    if not isinstance(text, unicode):
        text = unicode(text, 'utf-8')
        unicodeinput = False
    res = ''
    global allowed, allowedid
    # 'humanascii' and 'identifier' both target plain ASCII
    if encoding == 'humanascii' or encoding == 'identifier':
        enc = 'ascii'
    else:
        enc = encoding
    for ch in text:
        if (encoding == 'humanascii') and (ch in allowed):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        if (encoding == 'identifier') and (ch in allowedid):
            # ASCII chars, digits etc. stay untouched
            res += ch
            continue
        else:
            try:
                # succeeds iff ch is representable in the target encoding
                ch.encode(enc,'strict')
                if encoding == 'identifier':
                    # encodable but not in allowedid: flatten to underscore
                    res += '_'
                else:
                    res += ch
            except UnicodeEncodeError:
                ordinal = ord(ch)
                if mapping.has_key(ordinal):
                    # try to apply custom mappings
                    res += mapping.get(ordinal)
                elif decomposition(ch) or len(normalize('NFKD',ch)) > 1:
                    # strip combining marks from the NFKD form
                    normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)).strip()
                    # normalized string may contain non-letter chars too. Remove them
                    # normalized string may result to more than one char
                    if encoding == 'identifier':
                        res += ''.join([c for c in normalized if c in allowedid])
                    else:
                        res += ''.join([c for c in normalized if c in allowed])
                else:
                    # hex string instead of unknown char
                    res += "%x" % ordinal
    if encoding == 'identifier':
        # collapse underscore runs and force a leading letter
        res = res.strip('_').replace('_____','_').replace('____','_').replace('___','_').replace('__','_')
        if not res.strip('_')[0] in string.ascii_letters:
            res = '_' + res
    if unicodeinput:
        return res
    else:
        return res.encode('utf-8')
开发者ID:8-armedcat,项目名称:PyFileMaker,代码行数:60,代码来源:UnicodeNormalizer.py
示例15: string2filename
def string2filename(s, path = None, default=u"anonymous"):
    """convert a string to a valid filename

    Lowercases *s*, strips any directory part, transliterates via the
    module-level ``mapping`` table / NFKD decomposition, and keeps only
    filesystem-safe characters.  If *path* is given, the name is made
    unique in that directory by appending ``-1`` .. ``-99`` and then UUID
    suffixes; returns None when every attempt collides.

    NOTE(review): Python-2-only (types.UnicodeType, unicode()).
    """
    from unicodedata import decomposition, normalize
    # TODO: make it a better conversion?
    if type(s) != types.UnicodeType:
        s = unicode(s)
    s = s.strip()
    s = s.lower()
    if s=="":
        s = default
    # remove an eventual path
    s = s.replace("\\","/")
    _, s = os.path.split(s)
    res = u''
    mkeys = mapping.keys()  # NOTE(review): unused; the loop queries mapping.keys() directly
    for c in s:
        o = ord(c)
        if o in mapping.keys():
            # explicit transliteration table wins
            res = res+mapping[o]
            continue
        if decomposition(c):
            # NFKD so the combining accent gets dropped by the
            # valid_chars filter below
            res = res + normalize('NFKD', c)
        else:
            res = res + c
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    filename = ''.join(c for c in res if c in valid_chars)
    filename = filename.replace(" ","_")
    # if path is not None we can check if there already is a file with that name
    if path is None:
        return filename
    fullpath=os.path.join(path, filename)
    if not os.path.exists(fullpath):
        return filename
    # remove the extension
    root, ext = os.path.splitext(filename)
    # first try small numeric suffixes ...
    for idx in range(1,100):
        filename = "%s-%d%s" %(root, idx, ext)
        if not os.path.exists(os.path.join(path,filename)):
            return filename
    # ... then random UUID suffixes
    for idx in range(1,100):
        u = unicode(uuid.uuid4())
        filename = "%s-%s%s" %(root, u, ext)
        if not os.path.exists(os.path.join(path,filename)):
            return filename
    return None # we did not get a result, TODO: further checking
开发者ID:comlounge,项目名称:userbase,代码行数:59,代码来源:hooks.py
示例16: mapchar
def mapchar(self, key):
    """Return the codepoint that *key* maps to, memoizing the result in
    *self* (a dict-like cache).

    Decomposable characters map to the first (base) codepoint of their
    decomposition; otherwise the module-level CHAR_REPLACEMENT table is
    consulted, falling back to *key* itself.
    """
    cached = self.get(key)
    if cached is not None:
        return cached
    # unichr() only exists on Python 2; chr() is its Python 3 equivalent
    if sys.version_info >= (3, 0):
        decomposed = unicodedata.decomposition(chr(key))
    else:
        decomposed = unicodedata.decomposition(unichr(key))
    if not decomposed:
        # no decomposition: use the explicit replacement table if any
        result = CHAR_REPLACEMENT.get(key, key)
    else:
        try:
            # first field is the base codepoint in hex
            result = int(decomposed.split(None, 1)[0], 16)
        except (IndexError, ValueError):
            # "<tag>" field (compatibility decomposition): keep the key
            result = key
    self[key] = result
    return result
开发者ID:aldryn,项目名称:aldryn-segmentation,代码行数:17,代码来源:unaccent.py
示例17: buildCompatChars
def buildCompatChars(font, hbfont):
    """Fill the Arabic presentation-forms blocks of the FontForge font
    *font* with composite glyphs, shaping every compatibility decomposition
    in one batch with *hbfont* via runHB().

    NOTE(review): depends on module-level runHB(), psMat, ``ucd``
    (unicodedata) and the Python-2-only ``unichr``.
    """
    zwj = u'\u200D'  # ZERO WIDTH JOINER
    # Arabic presentation forms A and B codepoint ranges (inclusive)
    ranges = (
        (0xfb50, 0xfbb1),
        (0xfbd3, 0xfd3d),
        (0xfd50, 0xfdf9),
        (0xfdfc, 0xfdfc),
        (0xfe70, 0xfefc),
    )
    # build one newline-separated shaping request for all codepoints,
    # remembering which codepoint each line belongs to
    text = u''
    codes = []
    for r in ranges:
        for c in range(r[0], r[1]+1):
            # decomposition is a "<tag>" keyword followed by hex codepoints
            dec = ucd.decomposition(unichr(c)).split()
            if dec:
                codes.append(c)
                keyword = dec[0]
                new_text = u''
                for i in dec[1:]:
                    new_text += unichr(int(str(i),16))
                # surround with ZWJ so shaping yields the right positional
                # (initial/medial/final) form
                if keyword == '<initial>':
                    new_text = new_text + zwj
                elif keyword == '<final>':
                    new_text = zwj + new_text
                elif keyword == '<medial>':
                    new_text = zwj + new_text + zwj
                text += new_text + '\n'
    lines = runHB(text, hbfont)
    i = 0
    for c in codes:
        components = lines[i]
        i += 1
        if components:
            # rebuild the presentation form as references to the shaped
            # component glyphs
            glyph = font.createChar(c)
            glyph.clear()
            glyph.color = 0xff0000 # red color
            x = 0
            for component in components:
                name = component[0]
                x_advance = component[1]
                y_advance = component[2]
                x_offset = component[3]
                y_offset = component[4]
                matrix = psMat.translate(x + x_offset, y_offset)
                # ignore blank glyphs, e.g. space or ZWJ
                if font[name].foreground or font[name].references:
                    glyph.addReference(name, matrix)
                x += x_advance
            glyph.width = x
开发者ID:EbenSorkin,项目名称:Jomhuria,代码行数:57,代码来源:build_compat.py
示例18: make_index_value
def make_index_value(display_name):
    """Build a lowercase a-z index key from *display_name*.

    Each character is replaced by the base character of its decomposition
    (if any); only characters in 'a'..'z' are kept.
    """
    buf = bytearray()
    for ch in display_name:
        decomposition = unicodedata.decomposition(ch)
        if len(decomposition) > 0:
            fields = decomposition.split()
            # BUG FIX: compatibility decompositions are prefixed with a
            # "<tag>" field, which used to make int(..., 16) raise
            # ValueError; skip the tag and use the first real codepoint.
            if fields[0].startswith('<'):
                fields = fields[1:]
            if fields:
                ch = chr(int(fields[0], 16))
        if ch >= 'a' and ch <= 'z':
            buf.append(ord(ch))
    return buf.decode("ASCII")
开发者ID:bpeel,项目名称:catverbs,代码行数:11,代码来源:compile.py
示例19: normalizeRtlString
def normalizeRtlString(s):
    """Map Arabic presentation form B characters in *s* back to their basic
    (non-presentational) Arabic characters; all other characters pass
    through unchanged."""
    chars = []
    for c in s:
        # If this is an arabic presentation form b character (commonly given
        # by Windows when converting from glyphs), decompose it to its
        # original basic arabic (non-presentational) character.
        if 0xfe70 <= ord(c) <= 0xfeff:
            d = unicodedata.decomposition(c)
            d = d.split(' ') if d else None
            if d and len(d) == 2 and d[0] in ('<initial>', '<medial>', '<final>', '<isolated>'):
                # BUG FIX: chr() replaces the Python-2-only unichr(),
                # which raised NameError on Python 3.
                c = chr(int(d[1], 16))
        chars.append(c)
    return u"".join(chars)
开发者ID:sonar-gnu-linux,项目名称:nvda,代码行数:12,代码来源:displayModel.py
示例20: asciify
def asciify(string):
    """
    gets rid of pesky things like umlauts and tildes and other accents. ascii all the way, baby.
    """
    temp = u''
    for char in string:
        decomp = decomposition(char)
        if decomp: # Not an empty string
            first = decomp.split()[0]
            # BUG FIX: compatibility decompositions start with a "<tag>"
            # field (not hex), which used to make int(..., 16) raise
            # ValueError; keep such characters unchanged.
            if first.startswith('<'):
                temp += char
            else:
                # chr() replaces the Python-2-only unichr()
                temp += chr(int(first, 16))
        else:
            temp += char
    return temp
开发者ID:shajith,项目名称:hu,代码行数:12,代码来源:hu_lastfm.py
注:本文中的unicodedata.decomposition函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论