本文整理汇总了Python中unicodedata.category函数的典型用法代码示例。如果您正苦于以下问题:Python category函数的具体用法?Python category怎么用?Python category使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了category函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: splitText
def splitText(text):
    """Break *text* into chunks no longer than MAX_SEGMENT_SIZE characters.

    Prefers to cut at punctuation, then at whitespace, then at any
    non-alphanumeric character, and only as a last resort mid-word.
    Returns the list of segments.
    """
    max_size = __class__.MAX_SEGMENT_SIZE
    segments = []
    pending = __class__.cleanSpaces(text)
    while len(pending) > max_size:
        window = pending[:max_size]
        # Candidate cut points, tried from most to least desirable.
        # https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
        predicates = (
            lambda c: unicodedata.category(c) in ("Ps", "Pe", "Pi", "Pf", "Po"),
            lambda c: unicodedata.category(c).startswith("Z"),
            lambda c: unicodedata.category(c)[0] not in ("L", "N"),
        )
        cut = None
        for predicate in predicates:
            cut = __class__.findLastCharIndexMatching(window, predicate)
            if cut is not None:
                break
        if cut is None:
            # No natural boundary found: split at the window's last char.
            cut = max_size - 1
        segments.append(window[:cut + 1].rstrip())
        pending = pending[cut + 1:].lstrip(string.whitespace + string.punctuation)
    if pending:
        segments.append(pending)
    return segments
开发者ID:desbma,项目名称:GoogleSpeech,代码行数:32,代码来源:__init__.py
示例2: TokenOffsets
def TokenOffsets(string: str):
    """
    Yield the offsets of all Unicode category borders in *string*,
    including the initial 0 and the final offset value of ``len(string)``.

    Capitalized words special case: a single upper-case letter ('Lu')
    followed by lower-case letters ('Ll') is treated as a single token.
    """
    if not string:
        return
    yield 0
    prev = category(string[0])
    for idx in range(1, len(string)):
        cur = category(string[idx])
        if prev != cur:
            # Suppress the border inside a capitalized word: exactly one
            # leading upper-case letter followed by lower-case letters.
            capitalized = (
                prev == 'Lu' and cur == 'Ll'
                and (idx == 1 or category(string[idx - 2]) != 'Lu')
            )
            if not capitalized:
                yield idx
        prev = cur
    yield len(string)
开发者ID:fnl,项目名称:libfnl,代码行数:27,代码来源:strtok.py
示例3: ranking
def ranking(self):
"""
For each result, removes stopwords, ranks the word, augments the query
and returns True if successful else False
"""
print "Indexing results ...."
for i in range(len(self.results)):
result = self.results[i]
title = result[0]
summary = result[1]
# Remove punctuation and create lists of words
titleWords = "".join(c for c in title if not unicodedata.category(c).startswith('P')).split()
summaryWords = "".join(c for c in summary if not unicodedata.category(c).startswith('P')).split()
for tw in titleWords:
if tw.lower() in self.stopWords:
continue
if self.user_feedback[i] == 'y':
self.applyRanking(i, tw, True, True)
else:
self.applyRanking(i, tw, True, False)
for sw in summaryWords:
if sw.lower() in self.stopWords:
continue
if self.user_feedback[i] == 'y':
self.applyRanking(i, sw, False, True)
else:
self.applyRanking(i, sw, False, False)
print "Indexing results ...."
return self.augmentQuery()
开发者ID:akshaisarma,项目名称:QueryAugmentation,代码行数:35,代码来源:UI.py
示例4: normalize_roman
def normalize_roman(string, additional=None):
    """Removes diacritics from the string and converts to lowercase.

    :param string: the text to normalize.
    :param additional: optional mapping of characters to replacement
        characters; keys are substituted, and both keys and values are
        exempt from diacritic stripping.

    >>> normalize_roman(u'Eèé')
    u'eee'
    """
    if additional:
        # Characters appearing as keys or values are passed through /
        # substituted, never normalized.  list() wrappers keep this
        # working on Python 3, where dict views cannot be added with `+`.
        safe = list(additional.keys()) + list(additional.values())
        def gen():
            for c in string:
                if c not in safe:
                    yield normalize_roman(c)
                elif c in additional:
                    yield additional[c]
                else:
                    yield c
        return ''.join(gen())
    else:
        chars = []
        for c in string:
            if unicodedata.category(c) == 'Lo':
                # 'Lo' (other letters, e.g. CJK/Hangul syllables) carry no
                # case or combining marks worth stripping; keep untouched.
                chars.append(c)
            else:
                # Decompose (NFD), then drop combining marks ('Mn'),
                # i.e. the accents/diacritics.
                nor = unicodedata.normalize('NFD', c)
                chars.extend(x for x in nor if unicodedata.category(x) != 'Mn')
        return ''.join(chars).lower()
开发者ID:Pusungwi,项目名称:hangulize,代码行数:26,代码来源:normalization.py
示例5: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Characters from blacklisted categories must never be generated."""
    strategy = characters(blacklist_categories=('Lu', 'Nd'))
    # Characters outside each excluded category are still reachable...
    for excluded in ('Lu', 'Nd'):
        find(strategy, lambda c, cat=excluded: unicodedata.category(c) != cat)
    # ...while the excluded categories themselves never appear.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) in ('Lu', 'Nd'))
开发者ID:doismellburning,项目名称:hypothesis,代码行数:7,代码来源:test_simple_characters.py
示例6: consolidate_ampers
def consolidate_ampers(text: str) -> str:
    """Converts all ampersands in a text to a single one (&).
    :param text: A string which should have ampersands converted.
    :return: The text string after all ampersands have been replaced.
    """
    chosen_amper_value = "\u0026"

    def is_amper(ch: str) -> bool:
        # Only punctuation/symbol characters qualify; checking the
        # category first also avoids unnamed control chars making
        # unicodedata.name() raise ValueError.
        if not (unicodedata.category(ch).startswith('P')
                or unicodedata.category(ch).startswith('S')):
            return False
        return re.search(r" ampersand|ampersand ", unicodedata.name(ch),
                         re.IGNORECASE) is not None

    # Replace every ampersand-like character with the canonical one.
    for codepoint in range(sys.maxunicode):
        symbol = chr(codepoint)
        if symbol != chosen_amper_value and is_amper(symbol):
            text = text.replace(symbol, chosen_amper_value)
    return text
开发者ID:WheatonCS,项目名称:Lexos,代码行数:25,代码来源:scrubber.py
示例7: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Blacklisting "Lu" and "Nd" removes exactly those categories."""
    strategy = characters(blacklist_categories=("Lu", "Nd"))
    # Non-blacklisted characters remain reachable.
    find_any(strategy, lambda c: unicodedata.category(c) != "Lu")
    find_any(strategy, lambda c: unicodedata.category(c) != "Nd")
    # The blacklisted categories are gone entirely.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) in ("Lu", "Nd"))
开发者ID:HypothesisWorks,项目名称:hypothesis-python,代码行数:7,代码来源:test_simple_characters.py
示例8: tokens
def tokens(source):
    """Tokenize *source*, yielding Newline/Whitespace/Word/Symbol tokens.

    Newline characters become single-character NewlineToken; runs of
    control/whitespace, of letters/digits, and of a repeated symbol
    character are grouped into one token each.
    """
    pos = 0
    size = len(source)
    while pos < size:
        ch = source[pos]
        major = category(ch)[0]
        if ch in NEWLINE_CHARS:
            yield NewlineToken(source[pos])
            pos += 1
            continue
        end = pos + 1
        if major in "CZ":
            # Run of control ('C') / separator ('Z') characters.
            while end < size and category(source[end])[0] in "CZ":
                end += 1
            yield WhitespaceToken(source[pos:end])
        elif major in "LN":
            # Run of letters ('L') / numbers ('N') forms a word.
            while end < size and category(source[end])[0] in "LN":
                end += 1
            yield WordToken(source[pos:end])
        else:
            # Run of the *same* symbol character (e.g. '***').
            while end < size and source[end] == ch:
                end += 1
            yield SymbolToken(source[pos:end])
        pos = end
开发者ID:nigelsmall,项目名称:nige.tech,代码行数:26,代码来源:syntaq_experiment.py
示例9: crear_nombre_usuario
def crear_nombre_usuario(nombre, apellidos):
    """Generate a unique username from a person's name and surnames.

    The username is built from: one initial per given-name word, plus the
    first surname word in full, plus one initial per remaining surname
    word, plus a numeric suffix incremented until no existing Gauser
    matches.  Diacritics are stripped (NFD + dropping 'Mn' marks) and
    everything is lowercased first.
    """
    # Strip accents, lowercase, and split into words.
    nombre = ''.join(
        c for c in unicodedata.normalize('NFD', smart_unicode(nombre))
        if unicodedata.category(c) != 'Mn').lower().split()
    apellidos = ''.join(
        c for c in unicodedata.normalize('NFD', smart_unicode(apellidos))
        if unicodedata.category(c) != 'Mn').lower().split()
    iniciales_nombre = ''.join(parte[0] for parte in nombre)
    try:
        iniciales_apellidos = apellidos[0]
    except IndexError:
        # Users without surnames get a fixed placeholder.
        iniciales_apellidos = 'sin'
    # Append the initial of every surname word after the first.
    for parte in apellidos[1:]:
        if parte:
            iniciales_apellidos += parte[0]
    usuario = iniciales_nombre + iniciales_apellidos
    # Find the first numeric suffix not already taken.
    n = 1
    while True:
        username = usuario + str(n)
        try:
            Gauser.objects.get(username=username)
        except Exception:
            # Lookup failed (typically Gauser.DoesNotExist): name is free.
            return username
        n += 1
开发者ID:jjmartinr01,项目名称:GaussProject,代码行数:31,代码来源:views.py
示例10: __new__
def __new__(cls,s,on_fail='die',msg=None):
    """Validate *s* against the class constraints and build the str subclass.

    Checks character categories, screen width / length bounds and the
    class's allowed/forbidden symbol lists; on any failure, delegates to
    cls.init_fail() with the triggering exception.
    """
    # Idempotent: an instance of this class passes through unchanged.
    if type(s) == cls: return s
    cls.arg_chk(on_fail)
    # Sanity-check class configuration: forbidden/allowed must be lists
    # of single-character strings.
    for k in cls.forbidden,cls.allowed:
        assert type(k) == list
        for ch in k: assert type(ch) == str and len(ch) == 1
    try:
        s = s.strip()
        if type(s) != str:
            s = s.decode('utf8')
        for ch in s:
            # Allow: (L)etter,(N)umber,(P)unctuation,(S)ymbol,(Z)space
            # Disallow: (C)ontrol,(M)combining
            # Combining characters create width formatting issues, so disallow them for now
            if unicodedata.category(ch)[0] in 'CM':
                t = { 'C':'control', 'M':'combining' }[unicodedata.category(ch)[0]]
                raise ValueError('{}: {} characters not allowed'.format(ascii(ch),t))
        me = str.__new__(cls,s)
        if cls.max_screen_width:
            # Full ('F') / Wide ('W') East-Asian chars take two terminal
            # columns, so screen width can exceed len(s).
            me.screen_width = len(s) + len([1 for ch in s if unicodedata.east_asian_width(ch) in ('F','W')])
            assert me.screen_width <= cls.max_screen_width,(
                'too wide (>{} screen width)'.format(cls.max_screen_width))
        else:
            assert len(s) <= cls.max_len, 'too long (>{} symbols)'.format(cls.max_len)
        assert len(s) >= cls.min_len, 'too short (<{} symbols)'.format(cls.min_len)
        assert not cls.allowed or set(list(s)).issubset(set(cls.allowed)),\
            'contains non-allowed symbols: {}'.format(' '.join(set(list(s)) - set(cls.allowed)))
        assert not cls.forbidden or not any(ch in s for ch in cls.forbidden),\
            "contains one of these forbidden symbols: '{}'".format("', '".join(cls.forbidden))
        return me
    except Exception as e:
        # All validation failures funnel through the class's failure hook.
        return cls.init_fail(e,s)
开发者ID:mmgen,项目名称:mmgen,代码行数:32,代码来源:obj.py
示例11: parse
def parse(cls, string):
from unicodedata import category
parts = []
last_ch = None
for ch in string:
if last_ch is None:
parts.append([ch])
elif ch == ".":
if last_ch in ".-":
parts[-1][-1] += "0"
parts[-1].append("")
elif ch == "-":
if last_ch in ".-":
parts[-1][-1] += "0"
parts.append([""])
else:
if last_ch not in ".-" and category(ch)[0] != category(last_ch)[0]:
parts.append([ch])
else:
parts[-1][-1] += ch
last_ch = ch
for part in parts:
for i, x in enumerate(part):
try:
part[i] = int(x)
except (ValueError, TypeError):
pass
while len(part) > 1 and not part[-1]:
part[:] = part[:-1]
return cls(*map(tuple, parts))
开发者ID:neo4j-contrib,项目名称:boltkit,代码行数:30,代码来源:dist.py
示例12: get_match_list
def get_match_list(data, match_list, order_list=None, only_ascii=False, ignorecase=False):
    """
    Search a text string for matches against a dict of "ID" / "list of search strings":
        { "ID1" : ["String 1", "String 2", "String 3"],
          "ID2" : ["String 4", "String 5", "String 6"]
        }
    The dict must not contain the same search string under several IDs.
    Searches are tried from the longest search string to the shortest; when one
    matches, it is removed from the text before subsequent searches, so two
    categories are not both detected when one string is a substring of another:
    e.g. with "Idioma Español" and "Español", the text
    "Pablo sabe hablar el Idioma Español" matches "Idioma Español" but not
    "Español" — the longest match takes priority.
    Returns an ad-hoc object with attributes `key` (the winning ID or None)
    and `index` (its position in order_list, when order_list is given).
    """
    import unicodedata
    match_dict = dict()
    matches = []
    # Decode the text to unicode (Python 2 API).
    data = unicode(data, "utf8")
    # Flatten to {"String 1": "ID1", "String 2": "ID1", "String 4": "ID2"},
    # decoding keys to unicode, and validate against order_list/duplicates.
    for key in match_list:
        if order_list and not key in order_list:
            raise Exception("key '%s' not in match_list" % key)
        for value in match_list[key]:
            if value in match_dict:
                raise Exception("Duplicate word in list: '%s'" % value)
            match_dict[unicode(value, "utf8")] = key
    # If ignorecase, upper-case both the text and every search string.
    if ignorecase:
        data = data.upper()
        match_dict = dict((key.upper(), match_dict[key]) for key in match_dict)
    # If only_ascii, strip accents and Ñ via NFD decomposition, dropping
    # combining marks ('Mn'), on both the text and the search strings.
    if only_ascii:
        data = ''.join((c for c in unicodedata.normalize('NFD', data) if unicodedata.category(c) != 'Mn'))
        match_dict = dict((''.join((c for c in unicodedata.normalize('NFD', key) if unicodedata.category(c) != 'Mn')),
                           match_dict[key]) for key in match_dict)
    # Try candidates longest-first; strings already matched are removed from
    # the text so an overlapping shorter string cannot also match.
    for match in sorted(match_dict, key=lambda x: len(x), reverse=True):
        s = data
        for a in matches:
            s = s.replace(a, "")
        if match in s:
            matches.append(match)
    # NOTE(review): the returned key comes from matches[-1], i.e. the
    # *shortest* accepted match — presumably intended; confirm with callers.
    if matches:
        if order_list:
            return type("Mtch_list", (),
                        {"key": match_dict[matches[-1]], "index": order_list.index(match_dict[matches[-1]])})
        else:
            return type("Mtch_list", (), {"key": match_dict[matches[-1]], "index": None})
    else:
        if order_list:
            return type("Mtch_list", (), {"key": None, "index": len(order_list)})
        else:
            return type("Mtch_list", (), {"key": None, "index": None})
开发者ID:FusionwareIT,项目名称:Repository-FusionBox,代码行数:60,代码来源:descargas.py
示例13: characters
def characters(self, content):
    """SAX callback: route character data to the HTML writer.

    Behavior depends on parser state flags: inside a title, content is
    echoed (unless titles are ignored); in body text, paragraphs may be
    opened on demand and a pending definition term (self._endDfn) gets
    em-dash punctuation repair before the content is written.
    """
    text = content.strip()
    if self._inTitle:
        if self._headerProcessed:
            if not self._ignoreTitle:
                self._writeHtml(content)
    else :
        if self._headerProcessed:
            if not self._ignoreText:
                if len(text) > 0:
                    # Ensure a default title exists before any body text.
                    if not self._glossTitleWritten and not self._inTitle:
                        self._writeDefaultTitle()
                    # Open a generated paragraph if no block is open yet.
                    if not self._inParagraph and not self._inGeneratedPara and not self._inArticle and not self._lineGroupPara and not self._inTable:
                        self._startGeneratedPara()
                    if self._endDfn:
                        if self._keywordTag == 'dfn':
                            # 'Pd' = Unicode dash punctuation: text already
                            # starts with a dash, just add a space.
                            if unicodedata.category(content[0]) == 'Pd':
                                self._writeHtml(' ')
                            elif content[0] == ' ':
                                # Insert an em-dash after the term; add a
                                # trailing space when a dash follows anyway.
                                if unicodedata.category(text[0]) != 'Pd':
                                    self._writeHtml(u' \u2014')
                                else:
                                    self._writeHtml(u' \u2014 ')
                            self._writeHtml(content)
                        else: # 'h4' for fb2
                            # Drop a leading dash: the heading supplies it.
                            if unicodedata.category(text[0]) == 'Pd':
                                text = text[1:]
                            self._writeHtml(text.strip())
                        self._endDfn = False
                    else:
                        self._writeHtml(content)
开发者ID:davebooth,项目名称:osis-converters,代码行数:33,代码来源:glossary.py
示例14: test_characters_of_specific_groups
def test_characters_of_specific_groups():
    """Whitelisting "Lu" and "Nd" yields those categories and nothing else."""
    strategy = characters(whitelist_categories=("Lu", "Nd"))
    # Both whitelisted categories can be generated...
    for wanted in ("Lu", "Nd"):
        find_any(strategy, lambda c, cat=wanted: unicodedata.category(c) == cat)
    # ...and nothing outside them ever is.
    assert_no_examples(strategy, lambda c: unicodedata.category(c) not in ("Lu", "Nd"))
开发者ID:HypothesisWorks,项目名称:hypothesis-python,代码行数:7,代码来源:test_simple_characters.py
示例15: test_find_something_rare
def test_find_something_rare():
    """Only 'Zs' (space separators) can come out of a 'Zs'-whitelisted strategy."""
    strategy = characters(whitelist_categories=['Zs'], min_codepoint=12288)
    # A space-separator example exists at/above U+3000...
    find(strategy, lambda c: unicodedata.category(c) == 'Zs')
    # ...and no non-'Zs' example can be found.
    with pytest.raises(NoSuchExample):
        find(strategy, lambda c: unicodedata.category(c) != 'Zs')
开发者ID:AWhetter,项目名称:hypothesis-python,代码行数:7,代码来源:test_simple_characters.py
示例16: test_characters_of_specific_groups
def test_characters_of_specific_groups():
    """A whitelist of ('Lu', 'Nd') permits exactly those two categories."""
    strategy = characters(whitelist_categories=('Lu', 'Nd'))
    # Both categories are reachable...
    for wanted in ('Lu', 'Nd'):
        find(strategy, lambda c, cat=wanted: unicodedata.category(c) == cat)
    # ...anything else must be impossible to find.
    with pytest.raises(NoSuchExample):
        find(strategy, lambda c: unicodedata.category(c) not in ('Lu', 'Nd'))
开发者ID:AWhetter,项目名称:hypothesis-python,代码行数:8,代码来源:test_simple_characters.py
示例17: test_characters_of_specific_groups
def test_characters_of_specific_groups():
    """Whitelisted categories are generated; all others are excluded."""
    strategy = characters(whitelist_categories=('Lu', 'Nd'))
    find(strategy, lambda c: unicodedata.category(c) == 'Lu')
    find(strategy, lambda c: unicodedata.category(c) == 'Nd')
    # No character outside the whitelist should ever be produced.
    assert_no_examples(
        strategy, lambda c: unicodedata.category(c) not in ('Lu', 'Nd'))
开发者ID:doismellburning,项目名称:hypothesis,代码行数:8,代码来源:test_simple_characters.py
示例18: is_yelling
def is_yelling(stuff):
    """
    :return boolean True if all letters in stuff are uppercased
    """
    # Keep only letter characters (major category 'L'); Python 2 filter()
    # on a unicode string returns a unicode string.
    letters = filter(lambda c: 'L' in unicodedata.category(c), unicode(stuff))
    if letters == u'':
        # No letters at all cannot count as yelling.
        return False
    # Every remaining letter must be upper-case ('Lu').
    return all(('u' in unicodedata.category(c) for c in letters))
开发者ID:itsolutionscorp,项目名称:AutoStyle-Clustering,代码行数:8,代码来源:747cfdfae74c4b89bf39a015b942642d.py
示例19: test_exclude_characters_of_specific_groups
def test_exclude_characters_of_specific_groups():
    """Blacklisted categories never appear; everything else still can."""
    strategy = characters(blacklist_categories=('Lu', 'Nd'))
    find(strategy, lambda c: unicodedata.category(c) != 'Lu')
    find(strategy, lambda c: unicodedata.category(c) != 'Nd')
    # Searching for a blacklisted character must come up empty.
    with pytest.raises(NoSuchExample):
        find(strategy, lambda c: unicodedata.category(c) in ('Lu', 'Nd'))
开发者ID:AWhetter,项目名称:hypothesis-python,代码行数:8,代码来源:test_simple_characters.py
示例20: combine_modifiers
def combine_modifiers(self, string):
    """
    Given a string that is space-delimited on Unicode grapheme clusters,
    group Unicode modifier letters with their preceding base characters,
    deal with tie bars, etc.

    Parameters
    ----------
    string : str
        A Unicode string tokenized into grapheme clusters to be tokenized into simple IPA.
    """
    result = []
    graphemes = string.split()
    temp = ""
    count = len(graphemes)
    # Walk the graphemes in reverse, buffering modifier letters in `temp`
    # until their (forward-order) base character is reached.
    for grapheme in reversed(graphemes):
        count -= 1
        # Modifier letters ('Lm'), except the stress marks U+02C8 (712)
        # and U+02CC (716), attach to the preceding base character.
        if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" and not ord(grapheme) in [712, 716]:
            temp = grapheme+temp
            # hack for the cases where a space modifier is the first character in the string
            # NOTE(review): if result is empty here this raises IndexError;
            # presumably inputs never start with a bare modifier — confirm.
            if count == 0:
                result[-1] = temp+result[-1]
            continue
        # catch and repair stress marks
        if len(grapheme) == 1 and ord(grapheme) in [712, 716]:
            # Stress marks prepend to the following (already-seen) segment.
            result[-1] = grapheme+result[-1]
            temp = ""
            continue
        # combine contour tone marks (non-accents)
        if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
            if len(result) == 0:
                result.append(grapheme)
                temp = ""
                continue
            else:
                # Merge consecutive 'Sk' marks into one tone contour.
                if unicodedata.category(result[-1][0]) == "Sk":
                    result[-1] = grapheme+result[-1]
                    temp = ""
                    continue
        # Default: emit the grapheme plus any buffered modifiers.
        result.append(grapheme+temp)
        temp = ""
    # last check for tie bars
    segments = result[::-1]
    i = 0
    r = []
    while i < len(segments):
        # tie bars (U+0361 / U+035C) join this segment with the next one
        if ord(segments[i][-1]) in [865, 860]:
            r.append(segments[i]+segments[i+1])
            i = i+2
        else:
            r.append(segments[i])
            i += 1
    return " ".join(r)
开发者ID:FrankNagel,项目名称:orthotokenizer,代码行数:58,代码来源:tokenizer.py
注:本文中的unicodedata.category函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论