This article collects typical usage examples of the Python function w3lib.html.replace_entities. If you have been wondering what exactly replace_entities does, how to call it, and what real-world uses look like, the hand-picked examples below should help.
Twenty code examples of replace_entities are shown, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
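Before diving into the collected examples, a minimal sketch of the function itself may help. The signature below matches w3lib's public API; the sample strings are invented for illustration:

from w3lib.html import replace_entities

# replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8')
# decodes named and numeric character references into unicode text.
print(replace_entities(u'Price: &pound;100 &#8482;'))
# u'Price: \xa3100 \u2122'

# keep= preserves the listed entities verbatim, while remove_illegal=
# decides whether unrecognized entities are dropped or left untouched.
print(replace_entities(u'a &lt; b &bogus; c', keep=['lt']))
# u'a &lt; b  c'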
Example 1: test_regular

def test_regular(self):
    # regular conversions
    self.assertEqual(replace_entities(u'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(replace_entities(b'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                     u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')

Author: Preetwinder | Project: w3lib | Lines: 8 | Source file: test_html.py
Example 2: test_illegal_entities

def test_illegal_entities(self):
    self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                     u'a &lt; b &illegal; c &#12345678; six')
    self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                     u'a < b  c  six')
    self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
    self.assertEqual(replace_entities('x&#157;y'), u'xy')
    self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), u'x&#157;y')

Author: Preetwinder | Project: w3lib | Lines: 8 | Source file: test_html.py
Example 3: clean_url

def clean_url(url):
    clean_url = ''
    try:
        clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
    except ValueError:
        pass
    return clean_url

Author: AugustLONG | Project: scrapy | Lines: 7 | Source file: regex.py
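In the Scrapy source this is a nested helper, so base_url, response_encoding and clean_link come from the enclosing _extract_links scope. A self-contained sketch of the same idea follows; the parameter names and sample values are assumptions made for illustration:

from urllib.parse import urljoin
from w3lib.html import replace_entities

def clean_url(url, base_url, response_encoding='utf-8'):
    # Strip surrounding whitespace/quotes, decode HTML entities,
    # then resolve the link against the page's base URL.
    cleaned = url.decode(response_encoding).strip("\t\r\n '\"")
    return urljoin(base_url, replace_entities(cleaned))

print(clean_url(b'/a?x=1&amp;y=2', 'http://example.com/'))
# http://example.com/a?x=1&y=2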
Example 4: text

def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed:
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed:
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored:
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text:
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = replace_entities(region.text_content, encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()

Author: scrapy | Project: scrapely | Lines: 30 | Source file: extractors.py
Example 5: test_missing_semicolon

def test_missing_semicolon(self):
    for entity, result in (
            ('&lt&lt!', '<<!',),
            ('&LT!', '<!',),
            ('&#X41 ', 'A ',),
            ('&#x41!', 'A!',),
            ('&#x41h', 'Ah',),
            ('&#65!', 'A!',),
            ('&#65x', 'Ax',),
            ('&sup3!', u'\u00B3!',),
            ('&Aacute!', u'\u00C1!',),
            ('&#9731!', u'\u2603!',),
            ('&#153', u'\u2122',),
            ('&#x99', u'\u2122',),
    ):
        self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
        self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'), u'x%sy' % result)

Author: Preetwinder | Project: w3lib | Lines: 17 | Source file: test_html.py
Example 6: extract_raw_text

def extract_raw_text(html):
    text = replace_entities(html)
    text = re_clean_blanks.sub(u' ', text)
    text = re_clean_comments.sub(u' ', text)
    text = re_clean_javascript.sub(u' ', text)
    text = re_clean_style.sub(u' ', text)
    text = re_clean_balises.sub(u' ', text)
    text = re_clean_blanks.sub(u' ', text).strip()
    text = re_clean_multiCR.sub(u'\n', text)
    return text

Author: RouxRC | Project: gazouilleur | Lines: 10 | Source file: webmonitor.py
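The re_clean_* regexes are module-level constants that are not part of this excerpt. A rough sketch of plausible definitions follows; the names are real but the patterns are assumptions, so check the gazouilleur source for the actual ones:

import re

re_clean_blanks = re.compile(r'[ \t\xa0]+')                                   # runs of spaces/tabs/nbsp (assumed)
re_clean_comments = re.compile(r'<!--.*?-->', re.S)                           # HTML comments (assumed)
re_clean_javascript = re.compile(r'<script[^>]*>.*?</script>', re.S | re.I)   # script blocks (assumed)
re_clean_style = re.compile(r'<style[^>]*>.*?</style>', re.S | re.I)          # style blocks (assumed)
re_clean_balises = re.compile(r'<[^>]+>')                                     # remaining tags ("balises" is French for tags)
re_clean_multiCR = re.compile(r'\s*\n(\s*\n)+')                               # collapse runs of blank lines (assumed)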
Example 7: _extract_links

def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    if base_url is None:
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    return [Link(clean_url(url).encode(response_encoding),
                 clean_text(text))
            for url, _, text in links_text]

Author: 0326 | Project: scrapy | Lines: 11 | Source file: regex.py
Example 8: image_url

def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported

    Example:
    >>> image_url('')
    >>> image_url(' ')
    >>> image_url(' \\n\\n ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None

Author: scrapy | Project: scrapely | Lines: 54 | Source file: extractors.py
Example 9: extract_regex

def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire regex match is returned
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)
    try:
        strings = [regex.search(text).group("extract")]  # named group
    except:
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]

Author: lopuhin | Project: scrapy | Lines: 21 | Source file: misc.py
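A quick usage sketch of the helper above; the pattern and input string are made up for illustration:

html = u'<span id="price">15 &pound;</span>'

# The named group "extract" takes precedence over numbered groups.
# Entities in the match are decoded, except the kept 'lt' and 'amp'.
print(extract_regex(r'id="price">(?P<extract>[^<]+)', html))
# [u'15 \xa3']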
Example 10: extract_regex

def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire regex match is returned
    """
    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)
    try:
        strings = [regex.search(text).group('extract')]  # named group
    except:
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    # flatten removes nested structures (lists within lists, etc.) and returns one flat list.
    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]

Author: Terrenceyang213 | Project: SourceLearningNote-Scrapy- | Lines: 21 | Source file: misc_unfinished.py
Example 11: parse_item

def parse_item(self, response):
    links = dict()
    link_titles = set()
    url = response.url.split('#')[0].lower()
    url_head = url.split('/pages/')[0] + '/pages/'
    title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
    if title and title.endswith('- NHS Choices'):
        title = title.rstrip(' NHS Choices').rstrip(' -')
    subjects = response.xpath('//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content').extract_first().split(', ')
    subjects = [s.lower() for s in subjects if s]
    if not subjects:
        subjects = [title.lower()]
    description = clean_text(response.xpath('//meta[@name="DC.description"]/@content').extract_first())
    raw_page_content = response.xpath('//div[@class="main-content healthaz-content clear"]/.').extract_first()
    page_content = clean_text(replace_entities(remove_tags(raw_page_content)))
    for a in response.xpath('//div[@class="main-content healthaz-content clear"]/descendant::a'):
        label = a.xpath('text()').extract_first()
        href = a.xpath('@href').extract_first()
        if href and label:
            href = self.base_url + href.lstrip('/')
            href = href.lower()
            label = clean_text(label)
            if '/conditions/' in href and url_head not in href:
                link_titles.add(label)
                if href in links:
                    links[href]['count'] += 1
                else:
                    links[href] = {
                        'count': 1,
                        'label': label
                    }
            if url_head in href and href != url:
                print("********************", href)
                yield scrapy.Request(href, self.parse_item)
    article = NhsItem()
    article['url'] = url
    article['title'] = title
    article['subjects'] = subjects
    article['description'] = description
    article['page_content'] = str(page_content)
    article['links'] = links
    article['link_titles'] = list(link_titles)
    yield article

Author: mattkohl | Project: nhs-choice-search | Lines: 46 | Source file: conditions.py
Example 12: _has_ajaxcrawlable_meta

def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None

Author: 01- | Project: scrapy | Lines: 24 | Source file: ajaxcrawl.py
Example 13: clean_link

import urllib
import urlparse
from urlparse import urljoin

from w3lib.html import replace_entities


def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")

# return the first url in the list
list_first_item = lambda x: x[0] if x else None

# join the url against its base after stripping whitespace/punctuation and decoding entities
clean_url = lambda base_url, u, response_encoding: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))

# get a query-string parameter
def get_query(url, key):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])
    return query[key][0]

# set query-string parameters
def set_query(url, **args):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])

Author: dumengnan | Project: ohmydata_spider | Lines: 29 | Source file: select_result.py
Example 14: test_returns_unicode

def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(replace_entities(b'no entities'), six.text_type)
    assert isinstance(replace_entities(b'Price: &pound;100!'), six.text_type)
    assert isinstance(replace_entities(u'no entities'), six.text_type)
    assert isinstance(replace_entities(u'Price: &pound;100!'), six.text_type)

Author: Preetwinder | Project: w3lib | Lines: 6 | Source file: test_html.py
Example 15: test_encoding

def test_encoding(self):
    self.assertEqual(replace_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'),
                     u'x\u2122\u2122\u2122y')

Author: Preetwinder | Project: w3lib | Lines: 3 | Source file: test_html.py
Example 16: _cleanup

def _cleanup(value):
    return " ".join(replace_entities(replace_tags(value)).strip().split())

Author: pombredanne | Project: fortia | Lines: 2 | Source file: fortia-server.py
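A quick illustration of what this one-liner produces; the input string is made up:

print(_cleanup(u'<p>  Total:\n &pound;100 </p>'))
# u'Total: \xa3100'

Stripping tags first, then decoding entities, then re-joining on single spaces collapses all runs of whitespace, including newlines, into one space.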
Example 17: test_browser_hack

def test_browser_hack(self):
    # check browser hack for numeric character references in the 80-9F range
    self.assertEqual(replace_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')
    self.assertEqual(replace_entities('x&#x99;y', encoding='cp1252'), u'x\u2122y')

Author: Preetwinder | Project: w3lib | Lines: 4 | Source file: test_html.py
Example 18: remove_entities

def remove_entities(text, encoding):
    return replace_entities(text, keep=_ENTITIES_TO_KEEP, encoding=encoding)

Author: daqv | Project: portia-dashboard | Lines: 2 | Source file: html.py
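_ENTITIES_TO_KEEP is defined elsewhere in that module and is not part of the excerpt. A plausible value, assumed purely for illustration (the project's actual list may differ):

# Assumed value; check the portia-dashboard source for the real list.
_ENTITIES_TO_KEEP = ['lt', 'gt', 'amp', 'quot']

print(remove_entities(u'1 &lt; 2 &amp; &pound;5', 'utf-8'))
# u'1 &lt; 2 &amp; \xa35'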
Example 19: test_keep_entities

def test_keep_entities(self):
    # keep some entities
    self.assertEqual(replace_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                     u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
    self.assertEqual(replace_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=[u'lt', u'amp']),
                     u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

Author: Preetwinder | Project: w3lib | Lines: 6 | Source file: test_html.py
Example 20: type

    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))
    return arg


def clean_link(link_text):
    """
    Remove leading and trailing whitespace and punctuation
    """
    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, replace_entities(text=clean_link(u), encoding=response_encoding)
)
#
# clean_url = lambda base_url, u, response_encoding: urljoin(base_url,
#     replace_entities(
#         text=clean_link(u.decode(response_encoding, 'ignore')),
#         encoding=response_encoding)
# )

"""
Remove leading and trailing whitespace, punctuation, and entities from the given text,
then join the extracted link against the base_url.
"""

Author: DASungta | Project: CranberrySearchEngine_Spider | Lines: 30 | Source file: select_result.py
Note: the w3lib.html.replace_entities examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other source-code and documentation platforms, and the snippets were selected from open-source projects contributed by their authors. Copyright of the source code remains with the original authors; consult the corresponding project's License before using or redistributing any snippet. Do not reproduce this compilation without permission.