This article collects typical usage examples of Python's unicodedata.normalize function. If you have been wondering what unicodedata.normalize does in practice and how to call it, the curated examples below should help.
Twenty code examples of the normalize function follow, sorted by popularity by default. Upvote the ones you like or find useful; your feedback helps the system surface better Python examples.
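Before the examples, here is a minimal standalone sketch (ours, not drawn from any project below) of the four normalization forms that unicodedata.normalize accepts; the sample strings are illustrative only:

import unicodedata

s = "e\u0301"  # 'e' followed by a combining acute accent (two code points)
print(unicodedata.normalize("NFC", s))     # composes to the single code point U+00E9 ('é')
print(unicodedata.normalize("NFD", "é"))   # decomposes back into 'e' + U+0301
print(unicodedata.normalize("NFKC", "ﬁ"))  # compatibility composition folds the ligature into 'fi'
print(unicodedata.normalize("NFKD", "²"))  # compatibility decomposition folds the superscript into '2'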
Example 1: get_cast_crew
def get_cast_crew(self, url):
    request = get_file(url)
    soup = BeautifulSoup(request.text)
    main_dic = {}
    lst = [u'Cast', u'Production and Technical Credits']
    for i in xrange(len(lst)):
        main_dic[lst[i]] = np.nan
        dic = {}
        try:
            lst[i] = soup.findAll('div', {'id': 'cast'})[i].find('h1').text
            for row in soup.findAll('div', {'id': 'cast'})[i].findAll('tr'):
                position, filler, name = row.findAll('td')
                position = unicodedata.normalize('NFKD', position.text).encode('ascii', 'ignore')
                name = unicodedata.normalize('NFKD', name.text).encode('ascii', 'ignore')
                if position in dic:
                    dic[position] += [name]
                else:
                    dic[position] = [name]
            dic = json.dumps(dic)
        except:
            dic = np.nan
        main_dic[lst[i]] = dic
    return main_dic
Author: fx2323, Project: project_luther, Lines: 25, Source: the_numbers_web_crawler.py
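Example 1 leans on the widespread normalize('NFKD', ...).encode('ascii', 'ignore') idiom for stripping accents. As a hedged aside, a Python 3 version of that idiom might look like the sketch below (the helper name to_ascii is ours, not from the project above); note that it silently drops any character with no ASCII decomposition, such as Cyrillic or CJK, so it is lossy by design:

import unicodedata

def to_ascii(text):
    # NFKD splits each accented character into base letter + combining mark;
    # encoding to ASCII with errors='ignore' then drops the marks.
    return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

print(to_ascii("Mäkelä"))  # -> Makela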
Example 2: crawler
def crawler():
    arr = ["http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=07XG6QFJZEE6BBVY6J2Z&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1"]
    fp = open('data.csv', "w")
    a = csv.writer(fp, delimiter=',', quotechar="$")
    visited = []
    c = 0
    while c < 200:
        page = arr.pop()
        if page not in visited:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text)
            rate = unicodedata.normalize('NFKD', soup.find("span", attrs={"itemprop": "ratingValue"}).string).encode('ascii', 'ignore')
            n = float(rate)
            if n > 6.5 and n <= 8.5:
                c = c + 1
                name = unicodedata.normalize('NFKD', soup.find("h1", attrs={"itemprop": "name"}).text).encode('ascii', 'ignore')
                year = soup.find(attrs={"id": "titleYear"}).text
                director = unicodedata.normalize('NFKD', soup.find("span", attrs={"itemprop": "name"}).string).encode('ascii', 'ignore')
                print([c, name, year, director, n])
                a.writerow([c, name, year, director, n])
            divs = soup.find_all('div', attrs={"class": "rec-title"})
            links = [div.find('a')['href'] for div in divs]
            links = [urljoin(page, link) for link in links]
            arr = list(set(arr) | set(links))
            visited.append(page)
    fp.close()
Author: meenal5gupta, Project: imdb-crawler, Lines: 26, Source: imdb-crawler.py
Example 3: find_all_translations
def find_all_translations(soup):
    file_string = ''
    for word_data in soup.find_all("td", class_="list-title"):
        part_link = word_data.find("a")['href']
        full_link = domain + part_link
        soup2 = getSoup(full_link)
        translations = soup2.find("article", class_="item-page").find_all(style="text-align: center;")
        for translation in translations:
            tagalog = translation.find(['b', 'strong'])
            new_line = translation.find('br')
            if new_line:
                english = new_line.next_sibling
            else:
                english = None
            if tagalog and english and tagalog.string and english.string is not None:
                if ' ' not in tagalog.string.strip() and tagalog.string is not english.string:
                    file_string += unicodedata.normalize('NFD', tagalog.string.strip()).encode('ascii', 'ignore').decode("utf-8") + "\n"
                    file_string += unicodedata.normalize('NFD', str([word.strip() for word in english.string.strip().split(',')])).encode('ascii', 'ignore').decode("utf-8") + "\n"
                    file_string += "\n"
    f = open('translations.txt', 'a')
    f.write(file_string)
    f.close()
    next_page_link = soup.find('li', class_='pagination-next').find('a')['href']
    print('Parsing %s...' % (domain + next_page_link))
    find_all_translations(getSoup(domain + next_page_link))
Author: RoseySoft, Project: auto-downloaders, Lines: 34, Source: filipino-dictionary-downloader.py
Example 4: ok_to_send
def ok_to_send(day_start, day_end):
    now = datetime.datetime.now().time()
    dstart = unicodedata.normalize('NFKD', day_start).encode('ascii', 'ignore').split(":")
    dend = unicodedata.normalize('NFKD', day_end).encode('ascii', 'ignore').split(":")
    on_time = datetime.time(int(dstart[0]), int(dstart[1]))
    off_time = datetime.time(int(dend[0]), int(dend[1]))
    when, matching = check_time(now, on_time, off_time)
    should_I_send = False
    if matching:
        if when == DAY:
            return True
        elif when == NIGHT:
            return False
        else:
            return False
    else:
        return False
Author: kcjuntunen, Project: arduino_log, Lines: 25, Source: utility.py
Example 5: test_greek_print_ipa
def test_greek_print_ipa(self):
    """Test the Word class's `_print_ipa` in Greek."""
    w = grc.Word("élipe", grc.GREEK["Attic"]["Probert"])
    output = [w._print_ipa(True), w._print_ipa(False)]
    target = [unicodedata.normalize('NFC', "é.li.pe"),
              unicodedata.normalize('NFC', "élipe")]
    self.assertEqual(output, target)
Author: TylerKirby, Project: cltk, Lines: 7, Source: test_phonology.py
Example 6: freeze
def freeze(self):
    """Clean the destination and build all URLs from generators."""
    remove_extra = self.app.config['FREEZER_REMOVE_EXTRA_FILES']
    if not os.path.isdir(self.root):
        os.makedirs(self.root)
    if remove_extra:
        ignore = self.app.config['FREEZER_DESTINATION_IGNORE']
        previous_files = set(
            # See https://github.com/SimonSapin/Frozen-Flask/issues/5
            normalize('NFC', os.path.join(self.root, *name.split('/')))
            for name in walk_directory(self.root, ignore=ignore))
    seen_urls = set()
    seen_endpoints = set()
    built_files = set()
    for url, endpoint in self._generate_all_urls():
        seen_endpoints.add(endpoint)
        if url in seen_urls:
            # Don't build the same URL more than once
            continue
        seen_urls.add(url)
        new_filename = self._build_one(url)
        built_files.add(normalize('NFC', new_filename))
    self._check_endpoints(seen_endpoints)
    if remove_extra:
        # Remove files from the previous build that are not here anymore.
        for extra_file in previous_files - built_files:
            os.remove(extra_file)
            parent = os.path.dirname(extra_file)
            if not os.listdir(parent):
                # The directory is now empty, remove it.
                os.removedirs(parent)
    return seen_urls
Author: meantheory, Project: Frozen-Flask, Lines: 34, Source: __init__.py
Example 7: test_listdir2_returns_name_stat_pairs
def test_listdir2_returns_name_stat_pairs(self):
    funny_unicode = u'M\u00E4kel\u00E4'
    funny_utf8 = funny_unicode.encode('utf-8')
    self.fs.write_file(funny_utf8, 'data')
    pairs = self.fs.listdir2('.')
    self.assertEqual(len(pairs), 1)
    self.assertEqual(len(pairs[0]), 2)
    name_utf8, st = pairs[0]
    self.assertEqual(type(name_utf8), str)
    name_unicode = name_utf8.decode('utf-8')
    # See https://en.wikipedia.org/wiki/Unicode_equivalence for
    # background. The NFKD normalisation seems to be the best way
    # to ensure things work across Linux and Mac OS X both (their
    # default normalisation for filenames is different).
    self.assertEqual(
        unicodedata.normalize('NFKD', name_unicode),
        unicodedata.normalize('NFKD', funny_unicode))
    self.assertTrue(hasattr(st, 'st_mode'))
    self.assertFalse(hasattr(st, 'st_mtime'))
    self.assertTrue(hasattr(st, 'st_mtime_sec'))
    self.assertTrue(hasattr(st, 'st_mtime_nsec'))
Author: obnam-mirror, Project: obnam, Lines: 25, Source: vfs.py
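The comment in Example 7 points at a real portability issue: macOS has historically stored filenames in decomposed form while Linux keeps whatever bytes it is given, so byte-for-byte filename comparisons can differ across platforms. A short illustration of why the test normalizes both sides (the values are our own, runnable anywhere):

import unicodedata

name = "Mäkelä"
nfc = unicodedata.normalize("NFC", name)
nfd = unicodedata.normalize("NFD", name)
print(nfc == nfd)          # False: different code point sequences for the same text
print(len(nfc), len(nfd))  # 6 8
print(unicodedata.normalize("NFKD", nfc) == unicodedata.normalize("NFKD", nfd))  # True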
Example 8: CrearPedidoCertificado
def CrearPedidoCertificado(self, cuit="", empresa="", nombre="pyafipws",
filename="empresa.csr"):
"Crear un certificate signing request (X509 CSR)"
from M2Crypto import RSA, EVP, X509
# create the certificate signing request (CSR):
self.x509_req = X509.Request ()
# normalizar encoding (reemplazar acentos, eñe, etc.)
if isinstance(empresa, unicode):
empresa = unicodedata.normalize('NFKD', empresa).encode('ASCII', 'ignore')
if isinstance(nombre, unicode):
nombre = unicodedata.normalize('NFKD', nombre).encode('ASCII', 'ignore')
# subjet: C=AR/O=[empresa]/CN=[nombre]/serialNumber=CUIT [nro_cuit]
x509name = X509.X509_Name ()
# default OpenSSL parameters:
kwargs = {"type": 0x1000 | 1, "len": -1, "loc": -1, "set": 0}
x509name.add_entry_by_txt(field='C', entry='AR', **kwargs)
x509name.add_entry_by_txt(field='O', entry=empresa, **kwargs)
x509name.add_entry_by_txt(field='CN', entry=nombre, **kwargs)
x509name.add_entry_by_txt(field='serialNumber', entry="CUIT %s" % str(cuit), **kwargs)
self.x509_req.set_subject_name(x509name)
# sign the request with the previously created key (CrearClavePrivada)
self.x509_req.set_pubkey (pkey=self.pkey)
self.x509_req.sign(pkey=self.pkey, md='sha256')
# save the CSR result to a file:
f = open(filename, "w")
f.write(self.x509_req.as_pem())
f.close()
return True
Author: psgreco, Project: pyafipws, Lines: 32, Source: wsaa.py
Example 9: add_other_bank_account
def add_other_bank_account(request):
    """
    Add a receiver at another bank to whom the user wants to transfer money.
    Fills in the receiver's details and validates them.
    """
    try:
        cust_id = request.session.get('user_id')
        name = request.POST["name"]
        connected_acc_no1 = request.POST["account_no"]
        confirm_acc_no = request.POST["account_no_2"]
        addressline1 = request.POST["line1"]
        addressline2 = request.POST["line2"]
        addressline3 = request.POST["line3"]
        IFSC_code1 = request.POST["IFSC"]
        limit1 = request.POST["limit"]
        error1 = "Account Confirmation Failed"
        error2 = "Please Enter Valid numbers in fields"
        error3 = "Please Enter numeral entries in fields"
        error4 = "Sorry The account you wish to connect does not exist"
        error6 = "Account Already Added"
        error7 = "IFSC code does not exist"
        if connected_acc_no1 != confirm_acc_no:
            return render_to_response("add_other_bank_account.html", {'error': error1, 'STATIC_URL': "/static/"})
        limit = unicodedata.normalize('NFKD', limit1).encode('ascii', 'ignore')
        connected_acc_no = unicodedata.normalize('NFKD', connected_acc_no1).encode('ascii', 'ignore')
        IFSC_code = unicodedata.normalize('NFKD', IFSC_code1).encode('ascii', 'ignore')
        try:
            i = float(limit)
        except (ValueError, TypeError):
            return render_to_response("add_other_bank_account.html", {'error': error3, 'STATIC_URL': "/static/"})
        else:
Author: pgiitu, Project: Online_transactions_iteration2, Lines: 32, Source: views.py
Example 10: toRSSItem
def toRSSItem(self):
    title = self.repo.tagname
    if self.message and len(self.message) > 50: title += " - " + self.message[:50] + "..."
    elif self.message: title += " - " + self.message
    if self.dbkeywords: title += " - " + ",".join(self.dbkeywords)
    description = "<pre>"
    description += self.getpprint()
    description += "</pre>"
    title = unicodedata.normalize('NFKD', unicode(title, 'utf-8')).encode('ascii', 'ignore')
    description = unicodedata.normalize('NFKD', unicode(description, 'utf-8')).encode('ascii', 'ignore')
    link = ''
    if self.repo.viewlink:
        link = self.repo.viewlink.replace('%ID', self.uniqueid)
    item = RSSItem(
        title = title,
        link = link,
        description = description,
        guid = Config.rooturl + "/commit/" + self.repo.tagname + "/" + self.uniqueid,
        pubDate = unixToDatetime(self.date)
    )
    return item
Author: sirvaliance, Project: code-audit-feed, Lines: 25, Source: commit.py
Example 11: normalize_token
def normalize_token(data):
    # credit: http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    data = unicodedata.normalize(
        "NFC", "".join(c for c in unicodedata.normalize("NFD", data) if unicodedata.category(c) != "Mn").lower()
    )
    data = re.sub(ur"['’]", "", data)
    return data
Author: bydesign, Project: openscriptures, Lines: 7, Source: import_helpers.py
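Example 11 strips accents by decomposing to NFD, discarding code points in category 'Mn' (nonspacing marks), and recomposing to NFC. Unlike the encode('ascii', 'ignore') idiom, this keeps non-Latin base characters intact. A minimal sketch of the same technique (the function name strip_marks is ours):

import unicodedata

def strip_marks(text):
    decomposed = unicodedata.normalize("NFD", text)
    kept = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
    return unicodedata.normalize("NFC", kept)

print(strip_marks("café naïve"))  # -> cafe naive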
Example 12: __init__
def __init__(self):
    if xbmc:
        self.RssFeedsPath = xbmc.translatePath('special://userdata/RssFeeds.xml').decode("utf-8")
    else:
        self.RssFeedsPath = r'C:\Documents and Settings\Xerox\Application Data\XBMC\userdata\RssFeeds.xml'
    sane = self.checkRssFeedPathSanity()
    if sane:
        try:
            self.feedsTree = parse(self.RssFeedsPath)
        except:
            log('[script] RSS Editor --> Failed to parse ' + unicodedata.normalize('NFKD', self.RssFeedsPath).encode('ascii', 'ignore'))
            regen = xbmcgui.Dialog().yesno(getLS(40), getLS(51), getLS(52), getLS(53))
            if regen:
                log('[script] RSS Editor --> Attempting to Regenerate RssFeeds.xml')
                xml = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<rssfeeds>\n\
    <!-- RSS feeds. To have multiple feeds, just add a feed to the set. You can also have multiple sets. !-->\n\
    <!-- To use different sets in your skin, each must be called from skin with a unique id. !-->\n\
    <set id="1">\n        <feed updateinterval="30">http://feeds.feedburner.com/xbmc</feed>\n    </set>\n</rssfeeds>'
                f = open(self.RssFeedsPath, 'w')
                f.write(xml)
                f.close()
                self.__init__()
            else:
                log('[script] RSS Editor --> User opted to not regenerate RssFeeds.xml. Script Exiting')
                self.feedsTree = False
        if self.feedsTree:
            self.feedsList = self.getCurrentRssFeeds()
    else:
        self.feedsTree = False
        self.feedsList = False
        log('[SCRIPT] RSS Editor --> Could not open ' + unicodedata.normalize('NFKD', self.RssFeedsPath).encode('ascii', 'ignore') + '. Either the file does not exist, or its size is zero.')
Author: noba3, Project: KoTos, Lines: 31, Source: xmlParser.py
Example 13: redirect_if_needed
def redirect_if_needed(self, i):
    params = {}
    need_redirect = False
    for k, v in i.items():
        if k in plurals:
            params[k] = None
            k = plurals[k]
            need_redirect = True
        if isinstance(v, list):
            if v == []:
                continue
            clean = [normalize('NFC', b.strip()) for b in v]
            if clean != v:
                need_redirect = True
            if len(clean) == 1 and clean[0] == u'':
                clean = None
        else:
            clean = normalize('NFC', v.strip())
            if clean == '':
                need_redirect = True
                clean = None
            if clean != v:
                need_redirect = True
        params[k] = clean
    if need_redirect:
        raise web.seeother(web.changequery(**params))
Author: bfalling, Project: openlibrary, Lines: 26, Source: code.py
Example 14: noDiacritics
def noDiacritics(s):
    """Removes any diacritics"""
    # sanity check
    if s is None:
        return None
    # try the right way first
    try:
        strAux = unicode(s, 'utf-8')
        # remove some chars
        strAux = strAux.replace(unichr(0xba), "")  # masculine ordinal indicator (º)
        strAux = strAux.replace(unichr(0xaa), "")  # feminine ordinal indicator (ª)
        # normalization
        ret = unicodedata.normalize('NFKD', strAux)
        ret = ret.encode('ascii', 'ignore')
    except:
        ret = None
    # try as a unicode encoded string
    if ret is None:
        try:
            strAux = s.decode('utf-8')
            # remove some chars
            strAux = strAux.replace(unichr(0xba), "")  # masculine ordinal indicator (º)
            strAux = strAux.replace(unichr(0xaa), "")  # feminine ordinal indicator (ª)
            # normalization
            ret = unicodedata.normalize('NFKD', strAux)
            ret = ret.encode('ascii', 'ignore')
        except:
            ret = s  # return as received
    return ret
Author: MGDevelopment, Project: library, Lines: 33, Source: __init__.py
Example 15: tokenizeComparison
def tokenizeComparison(self, given, correct):
    # compare in NFC form so accents appear correct
    given = ucd.normalize("NFC", given)
    correct = ucd.normalize("NFC", correct)
    s = difflib.SequenceMatcher(None, given, correct, autojunk=False)
    givenElems = []
    correctElems = []
    givenPoint = 0
    correctPoint = 0
    offby = 0

    def logBad(old, new, str, array):
        if old != new:
            array.append((False, str[old:new]))

    def logGood(start, cnt, str, array):
        if cnt:
            array.append((True, str[start:start + cnt]))

    for x, y, cnt in s.get_matching_blocks():
        # if anything was missed in correct, pad given
        if cnt and y - offby > x:
            givenElems.append((False, "-" * (y - x - offby)))
            offby = y - x
        # log any preceding bad elems
        logBad(givenPoint, x, given, givenElems)
        logBad(correctPoint, y, correct, correctElems)
        givenPoint = x + cnt
        correctPoint = y + cnt
        # log the match
        logGood(x, cnt, given, givenElems)
        logGood(y, cnt, correct, correctElems)
    return givenElems, correctElems
Author: ACEfanatic02, Project: anki, Lines: 30, Source: reviewer.py
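Example 15 normalizes both inputs to NFC before diffing so that canonically equivalent strings compare equal character by character. The underlying pitfall, in a few illustrative lines:

import unicodedata

a = "é"        # precomposed, U+00E9
b = "e\u0301"  # decomposed, 'e' + combining acute
print(a == b)  # False, although both render identically
print(unicodedata.normalize("NFC", a) == unicodedata.normalize("NFC", b))  # True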
Example 16: artist_search
def artist_search(results, media, lang, artist_name):
    # Precompose.
    try:
        artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
    except UnicodeError:
        artist_name = unicodedata.normalize('NFKD', artist_name)
    # Strip diacritics.
    stripped = u''
    for i in range(len(artist_name)):
        point = artist_name[i]
        if not unicodedata.combining(point):
            stripped += point
    artist_name = stripped
    json_obj = JSON.ObjectFromURL('http://127.0.0.1:32400/services/vevo/search?q=%s&artistsLimit=6&videosLimit=1' % (String.Quote(artist_name)))
    score = 100
    normalized_artist_name = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist_name))
    for artist in json_obj['artists']:
        # Require a perfect match after normalization to avoid false positives.
        normalized_artist_result = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist['name']))
        Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
        if normalized_artist_name == normalized_artist_result:
            results.add(SearchResult(
                id = artist['urlSafeName'],
                score = score
            ))
        score = score - 1
Author: kmoore134, Project: plexmediaserver-freebsd-10.1-amd64, Lines: 32, Source: __init__.py
Example 17: clean_song_data
def clean_song_data(self, artist, title):
    # convert to lowercase
    artist = artist.lower()
    title = title.lower()
    # remove accents
    artist = unicodedata.normalize('NFKD', artist)
    artist = "".join([c for c in artist if not unicodedata.combining(c)])
    title = unicodedata.normalize('NFKD', title)
    title = "".join([c for c in title if not unicodedata.combining(c)])
    if self.ignore_brackets:
        LYRICS_TITLE_STRIP.append("\(.*\)")
    # replace ampersands and the like
    for exp in LYRICS_ARTIST_REPLACE:
        artist = re.sub(exp[0], exp[1], artist)
    for exp in LYRICS_TITLE_REPLACE:
        title = re.sub(exp[0], exp[1], title)
    # strip things like "(live at Somewhere)", "(acoustic)", etc
    for exp in LYRICS_TITLE_STRIP:
        title = re.sub(exp, '', title)
    # compress spaces
    title = title.strip()
    artist = artist.strip()
    return (artist, title)
Author: Neptilo, Project: lLyrics, Lines: 29, Source: lLyrics.py
Example 18: fromUser
def fromUser(self, screen_name, tweets_number=10, is_bot=False):
    user = self.createUser(screen_name, is_bot)
    tweets = self.twitter_client.user_timeline(screen_name=screen_name, count=tweets_number)
    for i, status in enumerate(tweets):
        tweet = status._json
        text = tweet['text']
        date = tweet['created_at']
        entities = tweet['entities']
        user_mentions = entities['user_mentions']
        mentions_list = []
        if len(user_mentions) > 0:
            for mention in user_mentions:
                mentions_list.append(mention['screen_name'])
        text_string = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        date_string = unicodedata.normalize('NFKD', date).encode('ascii', 'ignore')
        name_mentions_string = ",".join(mentions_list)
        Tweet.create(
            user = user,
            text = text_string,
            date = date_string,
            source = status.source,
            mentions = name_mentions_string
        )
Author: AlexSoudant, Project: twitter-bot-detection, Lines: 27, Source: tweetimporter.py
Example 19: test_names
def test_names(self, data, time_locale):
    # GH 17354
    # Test .weekday_name, .day_name(), .month_name
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        assert data.weekday_name == 'Monday'
    if time_locale is None:
        expected_day = 'Monday'
        expected_month = 'August'
    else:
        with tm.set_locale(time_locale, locale.LC_TIME):
            expected_day = calendar.day_name[0].capitalize()
            expected_month = calendar.month_name[8].capitalize()
    result_day = data.day_name(time_locale)
    result_month = data.month_name(time_locale)
    # Work around https://github.com/pandas-dev/pandas/issues/22342
    # different normalizations
    if not PY2:
        expected_day = unicodedata.normalize("NFD", expected_day)
        expected_month = unicodedata.normalize("NFD", expected_month)
        result_day = unicodedata.normalize("NFD", result_day)
        result_month = unicodedata.normalize("NFD", result_month)
    assert result_day == expected_day
    assert result_month == expected_month
    # Test NaT
    nan_ts = Timestamp(NaT)
    assert np.isnan(nan_ts.day_name(time_locale))
    assert np.isnan(nan_ts.month_name(time_locale))
Author: TomAugspurger, Project: pandas, Lines: 34, Source: test_timestamp.py
Example 20: _getPDFText
def _getPDFText(self, filename, d):
    logger.debug(u"filename: %s" % filename)
    newparatextlist = list()
    try:
        pdfDoc = PdfFileReader(file(filename, u"rb"))
        pdfDict = pdfDoc.getDocumentInfo()
        for x in pdfDict.keys():
            d.addConceptKeyType(x[1:], pdfDict[x])
        # c.logConcepts()
        for page in pdfDoc.pages:
            text = page.extractText()
            if not isinstance(text, str):
                # fold to ASCII so downstream text handling is uniform
                text = unicodedata.normalize(u'NFKD', text).encode(u'ascii', u'ignore')
            logger.debug(u"PDF : %s" % text)
            newparatextlist.append(text + u". ")
        return newparatextlist
    except Exception, msg:
        logger.error(u"%s" % msg)
Author: Darth-Neo, Project: DirCrawler, Lines: 27, Source: nl_phase_a_DirCrawl.py
Note: the unicodedata.normalize examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by many developers; copyright remains with the original authors, and distribution or reuse should follow the corresponding project's license. Please do not republish without permission.