本文整理汇总了Python中w3lib.url.safe_url_string函数的典型用法代码示例。如果您正苦于以下问题:Python safe_url_string函数的具体用法?Python safe_url_string怎么用?Python safe_url_string使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了safe_url_string函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_safe_url_port_number
def test_safe_url_port_number(self):
    """Port numbers must survive URL-escaping; an empty port is dropped."""
    cases = [
        # an explicit port is preserved verbatim
        (u"http://www.example.com:80/résumé?q=résumé",
         "http://www.example.com:80/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
        # a trailing colon with no port number is removed entirely
        (u"http://www.example.com:/résumé?q=résumé",
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9"),
    ]
    for raw, expected in cases:
        self.assertEqual(safe_url_string(raw), expected)
开发者ID:scrapy,项目名称:w3lib,代码行数:7,代码来源:test_url.py
示例2: test_safe_url_string_misc
def test_safe_url_string_misc(self):
    """Raw Unicode and percent-escaped sequences may be mixed in one URL."""
    expected = "http://www.example.com/%C2%A3?unit=%C2%B5"
    # raw Unicode in the path, pre-escaped sequence in the query
    result = safe_url_string(u"http://www.example.com/£?unit=%C2%B5")
    self.assertTrue(isinstance(result, str))
    self.assertEqual(result, expected)
    # pre-escaped sequence in the path, raw Unicode in the query
    result = safe_url_string(u"http://www.example.com/%C2%A3?unit=µ")
    self.assertTrue(isinstance(result, str))
    self.assertEqual(result, expected)
开发者ID:codecov-test,项目名称:w3lib,代码行数:9,代码来源:test_url.py
示例3: _set_url
def _set_url(self, url):
    """Store *url* on ``self._url`` as a percent-escaped string.

    Accepts either ``str`` (Python 2 bytes) or ``unicode``; unicode
    input requires ``self.encoding`` to be set so it can be encoded.

    Raises:
        TypeError: for non-string input, or unicode input when
            ``self.encoding`` is ``None``.
    """
    if isinstance(url, str):
        self._url = safe_url_string(url)
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        # Fix: this branch is only reached when url IS unicode, so the
        # original `url if isinstance(url, unicode) else url.decode(...)`
        # fallback was dead code — pass url through directly.
        self._url = safe_url_string(url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
开发者ID:zhangcheng,项目名称:scrapy,代码行数:11,代码来源:__init__.py
示例4: test_safe_url_string_bytes_input_nonutf8
def test_safe_url_string_bytes_input_nonutf8(self):
    """Non-UTF-8 byte input is percent-escaped byte-for-byte."""
    # latin1-encoded bytes
    result = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5")
    self.assertTrue(isinstance(result, str))
    self.assertEqual(result, "http://www.example.com/%A3?unit=%B5")
    # cp1251-encoded bytes: u'Россия'.encode('cp1251')
    result = safe_url_string(b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff")
    self.assertTrue(isinstance(result, str))
    self.assertEqual(result, "http://www.example.com/country/%D0%EE%F1%F1%E8%FF")
开发者ID:codecov-test,项目名称:w3lib,代码行数:12,代码来源:test_url.py
示例5: _set_url
def _set_url(self, url):
    """Validate, escape, and store *url* on ``self._url``.

    Accepts ``str`` (Python 2 bytes) or ``unicode``; unicode input
    requires ``self.encoding``.  The stored URL must contain a scheme.

    Raises:
        TypeError: for non-string input, or unicode input when
            ``self.encoding`` is ``None``.
        ValueError: when the resulting URL has no ``:`` (no scheme).
    """
    if isinstance(url, str):
        self._url = escape_ajax(safe_url_string(url))
    elif isinstance(url, unicode):
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' %
                            type(self).__name__)
        # Fix: this branch is only reached when url IS unicode, so the
        # original `url if isinstance(url, unicode) else url.decode(...)`
        # fallback was dead code — pass url through directly.
        # NOTE(review): unlike the str branch, escape_ajax is not applied
        # here — preserved as-is; confirm whether that asymmetry is intended.
        self._url = safe_url_string(url, self.encoding)
    else:
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
开发者ID:SeaBear,项目名称:scrapy,代码行数:13,代码来源:__init__.py
示例6: test_safe_url_idna_encoding_failure
def test_safe_url_idna_encoding_failure(self):
    """Hosts that cannot be IDNA-encoded are left as-is while the rest
    of the URL is still escaped."""
    # empty leading DNS label
    self.assertEqual(
        safe_url_string(u"http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    # a single DNS label longer than the 63-character limit
    long_label = u"example" * 11
    self.assertEqual(
        safe_url_string(
            u"http://www." + long_label + u".com/résumé?q=résumé"),
        "http://www." + long_label +
        ".com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
开发者ID:Preetwinder,项目名称:w3lib,代码行数:13,代码来源:test_url.py
示例7: get_meta_refresh
def get_meta_refresh(text, baseurl='', encoding='utf-8'):
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is a
    float containing the delay in seconds and url is a string with the
    absolute url to redirect, resolved against *baseurl*.

    If no meta redirect is found, ``(None, None)`` is returned.
    """
    if six.PY2:
        baseurl = unicode_to_str(baseurl, encoding)
    # Fix: removed the leftover debug try/except that printed the whole
    # document to stdout before re-raising UnicodeDecodeError; the
    # exception now propagates unchanged without the side effect.
    text = str_to_unicode(text, encoding)
    # entities/comments must be resolved before matching the refresh tag
    text = remove_comments(remove_entities(text))
    m = _meta_refresh_re.search(text)
    if not m:
        return None, None
    interval = float(m.group('int'))
    # strip surrounding quotes/spaces the regex may have captured
    url = safe_url_string(m.group('url').strip(' "\''), encoding)
    url = moves.urllib.parse.urljoin(baseurl, url)
    return interval, url
开发者ID:fubuki,项目名称:w3lib,代码行数:26,代码来源:html.py
示例8: parse_all
def parse_all(self, response):
    """Generic crawl callback: submit every followable <a href> on the
    page to the baidu RPC endpoint and recurse within the same site."""
    self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    # Python 2 integer division: anything outside the 2xx range is skipped.
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    base_site = get_url_site(base_url)
    for sel in response.xpath('//a/@href'):
        # NOTE(review): .encode() assumes the extracted href is unicode
        # (Python 2 behavior); on Python 3 this would yield bytes — confirm.
        relative_url = sel.extract().encode(response.encoding)
        # skip pseudo-links that cannot be fetched
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
            continue
        abs_url = urljoin_rfc(base_url,relative_url)
        abs_url = safe_url_string(abs_url,encoding=response.encoding)
        # crude content-type guess from the extension of the last path segment
        filename = abs_url.split("?")[0].split("/")[-1]
        if filename :
            ctype = filename.split(".")[-1].lower()
        else:
            ctype = None
        # media/archive resources are not submitted at all
        if ctype in ["jpeg","jpg","swf","rar","zip","gz","gif","mov","png","bmp","exe","pps","db","txt","pptx",'xls',"ppt","xlsx"]:
            continue
        yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        # only recurse into links on the same site as the current page
        site = get_url_site(abs_url)
        if site != base_site:
            continue
        # document formats are submitted above but never crawled into
        if ctype in ["pdf","doc","docx","rtf",]:
            continue
        yield scrapy.Request(url=abs_url,callback=self.parse_all)
开发者ID:muzichenglong,项目名称:scrapyc,代码行数:31,代码来源:pdf.py
示例9: get_base_url
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url declared in the given HTML *text*, resolved
    relative to *baseurl*; when no base url is declared, *baseurl*
    itself is returned (escaped)."""
    match = _baseurl_re.search(to_unicode(text, encoding))
    if not match:
        return safe_url_string(baseurl)
    return moves.urllib.parse.urljoin(
        safe_url_string(baseurl),
        safe_url_string(match.group(1), encoding=encoding),
    )
开发者ID:scrapy,项目名称:w3lib,代码行数:17,代码来源:html.py
示例10: std_url
def std_url(url, keep_blank_values=True, keep_fragments=False):
    """Canonicalize *url*: sort its query parameters, lower-case the
    host, escape the path (defaulting to '/'), and optionally drop the
    fragment."""
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    pairs = cgi.parse_qsl(query, keep_blank_values)
    pairs.sort()
    if not keep_fragments:
        fragment = ''
    return urlparse.urlunparse((
        scheme,
        netloc.lower(),
        safe_url_string(path) or '/',
        params,
        urllib.urlencode(pairs),
        fragment,
    ))
开发者ID:UncleJim,项目名称:project,代码行数:8,代码来源:utils.py
示例11: test_safe_url_idna
def test_safe_url_idna(self):
    """Internationalized domain names are punycode-encoded while the
    path/query is percent-escaped; the operation is idempotent."""
    # adapted from:
    # https://ssl.icu-project.org/icu-bin/idnbrowser
    # http://unicode.org/faq/idn.html
    # + various others
    websites = (
        (u'http://www.färgbolaget.nu/färgbolaget', 'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'),
        (u'http://www.räksmörgås.se/?räksmörgås=yes', 'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'),
        (u'http://www.brændendekærlighed.com/brændende/kærlighed', 'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'),
        (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'),
        (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'),
        (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'),
        # --- real websites ---
        # in practice, this redirect (301) to http://www.buecher.de/?q=b%C3%BCcher
        (u'http://www.bücher.de/?q=bücher', 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),
        # Japanese
        (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),
        # Russian
        (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
        (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),
        # Korean
        (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
        (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),
        # Arabic
        (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),
        # Chinese
        (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
        (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
        (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
    )
    for idn_input, safe_result in websites:
        safeurl = safe_url_string(idn_input)
        self.assertEqual(safeurl, safe_result)
    # make sure the safe URL is unchanged when made safe a 2nd time
    for _, safe_result in websites:
        safeurl = safe_url_string(safe_result)
        self.assertEqual(safeurl, safe_result)
开发者ID:codecov-test,项目名称:w3lib,代码行数:45,代码来源:test_url.py
示例12: _set_url
def _set_url(self, url):
    """Validate *url*, escape it, and store it on ``self._url``;
    rejects non-string input and scheme-less URLs."""
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
    self._url = escape_ajax(safe_url_string(url, self.encoding))
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: %s' % self._url)
开发者ID:JohnDoes95,项目名称:project_parser,代码行数:9,代码来源:__init__.py
示例13: test_safe_url_string_bytes_input
def test_safe_url_string_bytes_input(self):
    """Bytes input defaults to UTF-8; an explicit page encoding is
    re-encoded to UTF-8 in the path but kept as-is in the query."""
    cases = [
        (b"http://www.example.com/", {},
         "http://www.example.com/"),
        # bytes input is assumed to be UTF-8
        (b"http://www.example.com/\xc2\xb5", {},
         "http://www.example.com/%C2%B5"),
        # page-encoding encoded bytes still end up as UTF-8 sequences in path
        (b"http://www.example.com/\xb5", {'encoding': 'latin1'},
         "http://www.example.com/%C2%B5"),
        (b"http://www.example.com/\xa3?unit=\xb5", {'encoding': 'latin1'},
         "http://www.example.com/%C2%A3?unit=%B5"),
    ]
    for raw, kwargs, expected in cases:
        result = safe_url_string(raw, **kwargs)
        self.assertTrue(isinstance(result, str))
        self.assertEqual(result, expected)
开发者ID:codecov-test,项目名称:w3lib,代码行数:18,代码来源:test_url.py
示例14: _set_url
def _set_url(self, url):
    """Normalize *url* to the native string type, escape it, and store
    it on ``self._url``; rejects non-string input and scheme-less URLs."""
    if not isinstance(url, six.string_types):
        raise TypeError('Request url must be str or unicode, got {0!s}:'.format(type(url).__name__))
    native = to_native_str(url, self.encoding)
    self._url = escape_ajax(safe_url_string(native))
    if ':' not in self._url:
        raise ValueError('Missing scheme in request url: {0!s}'.format(self._url))
开发者ID:runt18,项目名称:scrapy,代码行数:9,代码来源:__init__.py
示例15: parse_zgyszz
def parse_zgyszz(self,response):
    """Crawl callback for the zgyszz journal site: resolve the
    onclick-based download link on issue pages, submit file URLs to
    the baidu RPC endpoint, and follow listing/pagination links."""
    self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    # Python 2 integer division: only 2xx responses are processed.
    if response.status / 100 != 2:
        return
    #base_site = get_url_site(base_url)
    if "qklist/show-" in response.url:
        # Issue detail page: the download URL is embedded inside an
        # onclick attribute, single-quoted — split on "'" extracts it.
        base_url = get_base_url(response)
        downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
        relative_url = downLink.split("'")[1]
        abs_url = urljoin_rfc(base_url,relative_url)
        yield scrapy.Request(abs_url,callback=self.parse_zgyszz)
        yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        return
    if '/upload/qklist/' in response.url:
        # Direct file URL: just submit it, nothing more to parse.
        yield self.baidu_rpc_request({"url":response.url,"src_id":22})
        return
    base_url = response.url
    # Listing page: follow the links in the first row of the main table.
    for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        # skip pseudo-links that cannot be fetched
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
            continue
        abs_url = urljoin_rfc(base_url,relative_url)
        abs_url = safe_url_string(abs_url,encoding=response.encoding)
        request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
        #request.meta["dont_redirect"] = True
        yield request
        yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
    # Pagination links ("flickr" pager div) get the same treatment.
    for sel in response.xpath("//div[@class='flickr']/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
            continue
        abs_url = urljoin_rfc(base_url,relative_url)
        abs_url = safe_url_string(abs_url,encoding=response.encoding)
        request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
        yield request
        yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
开发者ID:muzichenglong,项目名称:scrapyc,代码行数:44,代码来源:pdf.py
示例16: get_base_url
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative
    to the given base url; when no base url is found, the given base
    url is returned (escaped)."""
    declared = _baseurl_re.search(str_to_unicode(text, encoding))
    base = unicode_to_str(baseurl, encoding)
    if declared:
        base = urljoin(base, declared.group(1).encode(encoding))
    return safe_url_string(base)
开发者ID:TheRinger,项目名称:find_books,代码行数:10,代码来源:html.py
示例17: test_safe_url_string_with_query
def test_safe_url_string_with_query(self):
    """``encoding`` controls the query escaping and ``path_encoding``
    the path escaping, independently; both default to UTF-8."""
    url = u"http://www.example.com/£?unit=µ"
    cases = [
        ({}, "http://www.example.com/%C2%A3?unit=%C2%B5"),
        ({'encoding': 'utf-8'}, "http://www.example.com/%C2%A3?unit=%C2%B5"),
        # encoding applies to the query only; the path stays UTF-8
        ({'encoding': 'latin-1'}, "http://www.example.com/%C2%A3?unit=%B5"),
        # path_encoding applies to the path only; the query stays UTF-8
        ({'path_encoding': 'latin-1'}, "http://www.example.com/%A3?unit=%C2%B5"),
        ({'encoding': 'latin-1', 'path_encoding': 'latin-1'},
         "http://www.example.com/%A3?unit=%B5"),
    ]
    for kwargs, expected in cases:
        result = safe_url_string(url, **kwargs)
        self.assertTrue(isinstance(result, str))
        self.assertEqual(result, expected)
开发者ID:codecov-test,项目名称:w3lib,代码行数:20,代码来源:test_url.py
示例18: image_url
def image_url(txt):
    """convert text to a url
    this is quite conservative, since relative urls are supported
    Example:
    >>> image_url('')
    >>> image_url(' ')
    >>> image_url(' \\n\\n ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    # NOTE(review): ``url`` here is a callable defined elsewhere in this
    # module (it shadows nothing local) — presumably a text-normalizing
    # extractor applied before escaping; confirm against the module.
    # Returns a single-element list (per the doctests) or None for
    # input with no extractable image URL.
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
开发者ID:4iji,项目名称:scrapely,代码行数:54,代码来源:extractors.py
示例19: parse
def parse(self, response):
    """Find PDF download links on journal article pages — both direct
    ``attachType=PDF&id=`` hrefs and ids hidden in onclick handlers —
    submit them to the baidu RPC endpoint, then follow all other links."""
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    # Python 2 integer division: only 2xx responses are processed.
    if response.status / 100 != 2:
        return
    count = 0
    for a in response.xpath('//a'):
        # anchor text must be short and mention "PDF" to be considered
        text = a.xpath("string(.)").extract()
        text = "".join(text).strip()
        if len(text) > 5 or "PDF" not in text:
            continue
        href = a.xpath("@href").extract()
        if len(href) != 1:
            continue
        href = href[0]
        if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
            # JS-only link: the article id is an argument of the onclick
            # handler, e.g. showArticleFile('...', id) — split on commas.
            onclick = a.xpath("@onclick").extract()[0]
            onclick = onclick.split(",")
            if len(onclick) < 2:
                continue
            # the id's position differs between handler variants
            if onclick[0].startswith("showArticleFile"):
                id = onclick[-1].split(")", 1)[0].replace("'", "")
            else:
                id = onclick[1].split(")", 1)[0].replace("'", "")
            # rebuild the download URL from the CN/EN site prefix
            if "/CN/" in response.url:
                pdf = response.url.split("/CN/", 1)[
                    0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            elif "/EN/" in response.url:
                pdf = response.url.split("/EN/", 1)[
                    0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            else:
                continue
        elif "attachType=PDF&id=" in href:
            # direct download link: just resolve it against the page URL
            abs_url = urljoin_rfc(response.url, href)
            pdf = abs_url
        else:
            continue
        # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id="+id
        # print pdf
        self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
        yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
        count += 1
    # also submit every other followable link on the page
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
    self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
开发者ID:wjianwei126,项目名称:scrapyc,代码行数:53,代码来源:pdf.py
示例20: parse_cameo
def parse_cameo(self, response):
    """Submit every followable <a href> on the page to the baidu RPC
    endpoint (no recursion into the discovered links)."""
    self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    page_base = get_base_url(response)
    for href_sel in response.xpath('//a/@href'):
        raw_href = href_sel.extract().encode(response.encoding)
        # pseudo-links (fragment-only, javascript:, mailto:) are unfetchable
        unfetchable = (raw_href == "#"
                       or raw_href.startswith("javascript:")
                       or raw_href.startswith("mailto:"))
        if unfetchable:
            continue
        absolute = urljoin_rfc(page_base, raw_href)
        absolute = safe_url_string(absolute, encoding=response.encoding)
        yield self.baidu_rpc_request({"url": absolute, "src_id": 22})
开发者ID:muzichenglong,项目名称:scrapyc,代码行数:13,代码来源:pdf.py
注:本文中的w3lib.url.safe_url_string函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论