本文整理汇总了Python中urllib.parse.urldefrag函数的典型用法代码示例。如果您正苦于以下问题:Python urldefrag函数的具体用法?Python urldefrag怎么用?Python urldefrag使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了urldefrag函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: startElementNS
def startElementNS(self, name, qname, attrs):
    # SAX callback for an opening element: push a fresh handler onto the
    # stack and resolve the element's effective xml:base and xml:lang,
    # preferring explicit attributes, then the parent handler, then the
    # document locator's public/system id.
    stack = self.stack
    stack.append(ElementHandler())
    # NOTE(review): `current` presumably resolves to the handler just
    # pushed and `parent` to the one below it — confirm against the
    # stack/current/parent properties defined elsewhere in the class.
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        # Explicit xml:base on this element: strip any fragment, then
        # resolve it relative to the inherited base (or the document id).
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        # No xml:base here: inherit from the parent; if still unknown,
        # fall back to the document id with its fragment removed.
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    if language is None:
        # xml:lang inherits from the enclosing element when absent.
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
开发者ID:0038lana,项目名称:Test-Task,代码行数:30,代码来源:rdfxml.py
示例2: __init__
def __init__(self, url, previous=None, **info):
    """Create a record for a discovered link.

    url: raw URL as found; normalized via urlnorm before use.
    previous: the link this one was discovered from (stored via
        ``set_previous``).
    **info: arbitrary metadata kept on the instance.

    Raises urlnorm.InvalidUrl (with the offending URL appended) if the
    URL cannot be normalized.
    """
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This means case-sensitivity, and a
    # whole lot of other things that the urlnorm library will do for us.
    # We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        # Re-raise with the URL included for context; chain the original
        # exception explicitly so the cause is preserved in tracebacks.
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url)) from e
    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url
    self.set_previous(previous)
    self.info = info
    self.post = None
    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
开发者ID:miracle2k,项目名称:track0,代码行数:27,代码来源:spider.py
示例3: getlinks
def getlinks(pageurl, pageresponse, domain):
    """Return the list of crawlable link targets found on this page.

    pageurl = URL of this page
    pageresponse = page content; response object from requests module
    domain = domain being crawled (None to return links to *any* domain)
    """
    soup = bs4.BeautifulSoup(pageresponse.text, "html.parser")
    # hrefs of every anchor, with fragment identifiers stripped
    stripped = (urldefrag(a.attrs.get('href'))[0]
                for a in soup.select('a[href]'))
    # drop empties, then resolve relative targets against the page URL
    absolute = [
        target if urlparse(target).netloc else urljoin(pageurl, target)
        for target in stripped
        if target
    ]
    if domain:
        # single-domain crawl: keep only links pointing at that domain
        return [t for t in absolute if urlparse(t).netloc == domain]
    return absolute
开发者ID:enamoni,项目名称:CrawlerPy3,代码行数:26,代码来源:assignment.py
示例4: get_div_link
def get_div_link(self, tip):
    """Return the defragmented href of the '.qlink' anchor next to `tip`,
    or '' when no such anchor is present."""
    anchor = tip.parent.find('a', class_='qlink')
    if not anchor:
        return ''
    return urldefrag(anchor.get('href'))[0]
开发者ID:littlezz,项目名称:IslandCollection,代码行数:7,代码来源:adnmb.py
示例5: url
def url(self, name, force=False):
    """
    Returns the real URL in DEBUG mode.

    Outside DEBUG (or when ``force``), looks up the content-hashed
    filename — via the cache when possible — before delegating to the
    parent storage's ``url()``, and re-attaches any fragment or ``?#``
    marker the caller included in ``name``.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            cache_key = self.cache_key(name)
            hashed_name = self.cache.get(cache_key)
            if hashed_name is None:
                # normalize Windows backslashes: URLs always use '/'
                hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                # set the cache if there was a miss
                # (e.g. if cache server goes down)
                self.cache.set(cache_key, hashed_name)
    final_url = super(CachedFilesMixin, self).url(hashed_name)
    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            # index 4 of the urlsplit tuple is the fragment slot
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            # keep a bare '?' so the ?# hack survives the round-trip
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)
    return unquote(final_url)
开发者ID:nai-central,项目名称:django-mediafiles,代码行数:33,代码来源:storage.py
示例6: oauth_callback
def oauth_callback():
    """Complete the OAuth round-trip: exchange the provider's response
    for a local role, log the user in, and redirect to the next URL with
    the auth token placed in the URL fragment."""
    if not settings.OAUTH:
        abort(404)
    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")
    # Broadcast the session to all registered handlers; the first one
    # that yields a non-None role wins and the function returns.
    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    for (_, role) in response:
        if role is None:
            continue
        db.session.commit()
        update_role(role)
        log.info("Logged in: %r", role)
        request.authz = Authz.from_role(role)
        record_audit(Audit.ACT_LOGIN)
        token = request.authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        # Strip any existing fragment so ours is the only one; the token
        # travels in the fragment so it never reaches server logs.
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)
    # No handler produced a role: authentication cannot proceed.
    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
开发者ID:pudo,项目名称:aleph,代码行数:29,代码来源:sessions_api.py
示例7: replace_refs
def replace_refs(cls, obj, _recursive=False, **kwargs):
    """
    Returns a deep copy of `obj` with all contained JSON reference objects
    replaced with :class:`JsonRef` instances.

    :param obj: If this is a JSON reference object, a :class:`JsonRef`
        instance will be created. If `obj` is not a JSON reference object,
        a deep copy of it will be created with all contained JSON
        reference objects replaced by :class:`JsonRef` instances
    :param base_uri: URI to resolve relative references against
    :param loader: Callable that takes a URI and returns the parsed JSON
        (defaults to global ``jsonloader``, a :class:`JsonLoader` instance)
    :param jsonschema: Flag to turn on `JSON Schema mode
        <http://json-schema.org/latest/json-schema-core.html#anchor25>`_.
        'id' keyword changes the `base_uri` for references contained within
        the object
    :param load_on_repr: If set to ``False``, :func:`repr` call on a
        :class:`JsonRef` object will not cause the reference to be loaded
        if it hasn't already. (defaults to ``True``)
    """
    # A single _URIDict is threaded through the whole recursion so every
    # document is resolved/stored at most once.
    store = kwargs.setdefault("_store", _URIDict())
    base_uri, frag = urlparse.urldefrag(kwargs.get("base_uri", ""))
    store_uri = None  # If this does not get set, we won't store the result
    if not frag and not _recursive:
        store_uri = base_uri
    # JSON Schema mode: an 'id' keyword rebases all nested references.
    # EAFP: obj may not be subscriptable or may lack 'id' — both are fine.
    try:
        if kwargs.get("jsonschema") and isinstance(obj["id"], basestring):
            kwargs["base_uri"] = urlparse.urljoin(
                kwargs.get("base_uri", ""), obj["id"]
            )
            store_uri = kwargs["base_uri"]
    except (TypeError, LookupError):
        pass
    # A JSON reference object is a mapping with a string-valued '$ref';
    # anything else falls through to the recursive copy below.
    try:
        if not isinstance(obj["$ref"], basestring):
            raise TypeError
    except (TypeError, LookupError):
        pass
    else:
        return cls(obj, **kwargs)
    # If our obj was not a json reference object, iterate through it,
    # replacing children with JsonRefs
    kwargs["_recursive"] = True
    path = list(kwargs.pop("_path", ()))
    if isinstance(obj, Mapping):
        # type(obj)(...) preserves the concrete mapping/sequence subclass
        obj = type(obj)(
            (k, cls.replace_refs(v, _path=path+[k], **kwargs))
            for k, v in iteritems(obj)
        )
    elif isinstance(obj, Sequence) and not isinstance(obj, basestring):
        obj = type(obj)(
            cls.replace_refs(v, _path=path+[i], **kwargs) for i, v in enumerate(obj)
        )
    if store_uri is not None:
        store[store_uri] = obj
    return obj
开发者ID:benzhou1,项目名称:jsonref,代码行数:60,代码来源:jsonref.py
示例8: getlinks
def getlinks(pageurl, domain, soup):
    """Return a list of links from this page to be crawled.

    pageurl = URL of this page
    domain = domain being crawled (None to return links to *any* domain)
    soup = BeautifulSoup object for this page
    """
    hrefs = [anchor.attrs.get('href') for anchor in soup.select('a[href]')]
    # strip fragment identifiers, then discard empty targets
    targets = [t for t in (urldefrag(h)[0] for h in hrefs) if t]
    # resolve relative links against the page URL
    resolved = [
        t if urlparse(t).netloc else urljoin(pageurl, t)
        for t in targets
    ]
    if domain:
        # single-domain crawl: drop links that leave the domain
        resolved = [t for t in resolved
                    if samedomain(urlparse(t).netloc, domain)]
    return resolved
开发者ID:dmahugh,项目名称:crawlerino,代码行数:26,代码来源:crawlerino.py
示例9: validate_url
def validate_url(url, parent_url='http:'):
    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
    url: string (or bytes) URL
    parent_url: optional string URL from which to inherit an implicit
    scheme.

    Returns: dict having:
    valid: boolean truth value.
    url: string modified URL.

    Note: when the URL has a host-relative path ('/...'), validity is
    judged on the URL as given, not on the joined result — so a bare
    '/path' with no netloc of its own reports valid=False even though
    the returned url is fully resolved (preserved from the original).
    """
    if isinstance(url, bytes):
        # normalize to str so parsing and comparisons behave uniformly
        url = url.decode()
    parsed_url = urlparse(url)
    if parsed_url.path.startswith('/'):
        # host-relative (or absolute) path: resolve against the parent
        # and drop any fragment
        url = urldefrag(urljoin(parent_url, url))[0]
    elif not parsed_url.scheme:
        # bare URL with no scheme: inherit the parent's (default http)
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url
        parsed_url = urlparse(url)
    valid = parsed_url.scheme in ('http', 'https', '') and \
        bool(parsed_url.netloc)
    return {'valid': valid, 'url': url}
开发者ID:samalba,项目名称:image-spider,代码行数:33,代码来源:validate_url.py
示例10: validate
def validate(url):
    """Recursively crawl `url` depth-first, recording wiki pages that
    link to missing ('absent') pages.

    Reads/mutates the module-level globals `visitedUrls`, `baseUrl` and
    `invalidWikiPages`. Recursion depth is bounded by the wiki's link
    graph; very large wikis could hit Python's recursion limit.
    """
    if url in visitedUrls:
        return
    visitedUrls.append(url)
    try:
        content = urlopen(url).read().decode("utf8")
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit. Any fetch or decode failure is
        # treated the same way: assume the content is binary and skip.
        return
    wikiUrls = []
    invalidUrls = []
    # This may seem redundant, but without the `.find_all('a')`, soup will also
    # contain the `DocType` element which does not have an `href` attribute.
    # See <http://stackoverflow.com/questions/17943992/beautifulsoup-and-soupstrainer-for-getting-links-dont-work-with-hasattr-returni>.
    soup = BeautifulSoup(content, parse_only=SoupStrainer('a', href=True)).find_all('a')
    for externalUrl in soup:
        # resolve relative links and strip fragments before comparing
        fullExternalUrl = urljoin(url, urldefrag(externalUrl['href']).url)
        if baseUrl in fullExternalUrl and \
                not fullExternalUrl.endswith('/_history'):
            if externalUrl.has_attr('class') and 'absent' in externalUrl['class']:
                # GitHub marks links to missing wiki pages with class "absent"
                invalidUrls.append(fullExternalUrl)
            else:
                wikiUrls.append(fullExternalUrl)
    if len(invalidUrls) > 0:
        invalidWikiPages.append((url, invalidUrls))
    for wikiUrl in wikiUrls:
        if wikiUrl not in visitedUrls:
            validate(wikiUrl)
开发者ID:bamos,项目名称:github-wiki-link-validator,代码行数:31,代码来源:link-validator.py
示例11: _parse
def _parse(self, page: BeautifulSoup, url):
    """Walk the article body's sibling chain, collecting transcript URLs
    grouped by season heading.

    Returns an OrderedDict mapping each season's <h2> text to an
    OrderedSet of absolute transcript URLs; "Equestria Girls" links are
    accumulated separately and appended under that key at the end.
    """
    seasons = OrderedDict()
    eqg = OrderedSet()
    # The first <h2> inside the article names the first season; tables
    # that follow a heading belong to that season's section.
    child = page.select_one("#WikiaArticle h2")
    season = child.text
    while child.next_sibling:
        child = child.next_sibling
        if child.name == "table":
            for a in child.find_all("a", string="Transcript"):
                # NOTE(review): the "new" class presumably marks red links
                # to not-yet-written pages — confirm against the wiki markup.
                if not a.has_attr("class") or "new" not in a["class"]:
                    episode_url, fragment = urldefrag(a["href"])
                    episode_url = urljoin(url, episode_url)
                    if "Equestria Girls" not in season:
                        if season not in seasons:
                            seasons[season] = OrderedSet()
                        seasons[season].append(episode_url)
                    else:
                        eqg.append(episode_url)
            continue
        if child.name == "h2":
            # a new <h2> starts the next season's section
            season = child.text
            continue
    seasons["Equestria Girls"] = eqg
    return seasons
开发者ID:ZhangYiJiang,项目名称:mlp-visualization,代码行数:29,代码来源:mlp_models.py
示例12: resolving
def resolving(self, ref):
    """
    Context manager which resolves a JSON ``ref`` and enters the
    resolution scope of this ref.

    :argument str ref: reference to resolve

    Yields the sub-document the reference's fragment points at. While
    the with-block runs, ``base_uri``/``referrer`` are swapped to the
    resolved document; they are restored on exit even on error.
    """
    full_uri = urlparse.urljoin(self.resolution_scope, ref)
    uri, fragment = urlparse.urldefrag(full_uri)
    if uri in self.store:
        # document already fetched (or pre-registered)
        document = self.store[uri]
    elif not uri or uri == self.base_uri:
        # empty uri or self-reference: resolve within the current document
        document = self.referrer
    else:
        document = self.resolve_remote(uri)
    # Save the outer scope so nested resolutions unwind correctly.
    old_base_uri, old_referrer = self.base_uri, self.referrer
    self.base_uri, self.referrer = uri, document
    try:
        with self.in_scope(uri):
            yield self.resolve_fragment(document, fragment)
    finally:
        self.base_uri, self.referrer = old_base_uri, old_referrer
开发者ID:alexstrat,项目名称:jsonschema,代码行数:26,代码来源:jsonschema.py
示例13: _url
def _url(self, hashed_name_func, name, force=False, hashed_files=None):
    """
    Return the non-hashed URL in DEBUG mode.

    Otherwise compute the content-hashed name via ``hashed_name_func``
    (optionally against an explicit ``hashed_files`` mapping), build the
    final URL from it, and re-attach any fragment or ``?#`` marker that
    was present on ``name``.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            args = (clean_name,)
            if hashed_files is not None:
                args += (hashed_files,)
            hashed_name = hashed_name_func(*args)
    final_url = super().url(hashed_name)
    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            # index 4 of the urlsplit tuple is the fragment slot
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            # keep a bare '?' so the ?# hack survives the round-trip
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)
    return unquote(final_url)
开发者ID:Damgaard,项目名称:django,代码行数:30,代码来源:storage.py
示例14: extract_domains
def extract_domains(site_text):
    """Return the distinct defragmented hrefs of anchors whose scheme is
    neither empty (relative) nor mailto.

    NOTE(review): despite the name, this returns full URLs minus their
    fragments, not bare domains — preserved as-is.
    """
    found = set()
    anchors_only = SoupStrainer("a")
    for anchor in BeautifulSoup(site_text, "html.parser", parse_only=anchors_only):
        if not anchor.has_attr('href'):
            continue
        if urlparse(anchor["href"]).scheme in ["", "mailto"]:
            continue
        found.add(urldefrag(anchor["href"])[0])
    return list(found)
开发者ID:creade,项目名称:pl-crawler,代码行数:7,代码来源:crawler.py
示例15: splitDecodeFragment
def splitDecodeFragment(url):
    """Split `url` into (url, fragment), percent-decoding the fragment
    as UTF-8.

    None maps to a pair of empty unicode strings, because urldefrag
    would hand back byte strings for it instead of unicode.
    """
    if url is None:
        empty = _STR_UNICODE("")
        return empty, empty
    urlPart, fragPart = urldefrag(url)
    if not isPy3:
        # Python 2: coerce both halves to unicode before decoding
        return _STR_UNICODE(urlPart), unquote(_STR_UNICODE(fragPart), "utf-8", errors=None)
    return urlPart, unquote(fragPart, "utf-8", errors=None)
开发者ID:jasonleinbach-wf,项目名称:Arelle-1,代码行数:8,代码来源:UrlUtil.py
示例16: _meta_schemas
def _meta_schemas():
    """
    Collect the urls and meta schemas from each known validator.
    """
    schemas = [validator.META_SCHEMA for validator in validators.values()]
    return {urlparse.urldefrag(schema["id"])[0]: schema for schema in schemas}
开发者ID:alexstrat,项目名称:jsonschema,代码行数:8,代码来源:jsonschema.py
示例17: test_spider
def test_spider(client, app, check_external_links):
    """Check that all links work

    Spiders the site, making sure all internal links point to existing pages.
    Includes fragments: any #hash in a link must correspond to existing element
    with id.

    If check_external_links is true, checks external links as well.
    """
    to_visit = {'http://localhost/'}
    visited = set()
    external = set()
    # defragmented URL -> set of fragments some page links to on it
    wanted_fragments = collections.defaultdict(set)
    # URL -> list of element ids found on that page (filled by check_url)
    page_ids = {}

    def recording_url_for(*args, **kwargs):
        # Wrap url_for so every URL the templates generate is queued.
        url = flask.url_for(*args, **kwargs)
        if url not in visited:
            to_visit.add(urljoin('http://localhost/', url))
        return url

    app.jinja_env.globals['url_for'] = recording_url_for
    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)
        links = []
        parsed = urlparse(url)
        if parsed.netloc == 'localhost':
            print('visit', url)
            page_ids[url] = []
            check_url(client, url, links, page_ids[url])
            for link in links:
                # resolve the link relative to the page it appeared on
                fullurl = urljoin('http://localhost/', url)
                fullurl = urljoin(fullurl, link)
                result = urldefrag(fullurl)
                defrag = result.url
                fragment = result.fragment
                if fragment:
                    # remember the fragment so it can be matched against
                    # the target page's element ids after the crawl
                    wanted_fragments[defrag].add(fragment)
                if defrag not in visited:
                    to_visit.add(defrag)
        else:
            if parsed.scheme in ('http', 'https'):
                external.add(url)
            else:
                print('ignore', url)
    # every linked-to fragment must exist as an element id on its page
    for url, fragments in wanted_fragments.items():
        assert fragments <= set(page_ids[url])
    if check_external_links:
        for url in external:
            print('check', url)
            check_external_link(url)
开发者ID:encukou,项目名称:pyvo.cz,代码行数:58,代码来源:test_generic.py
示例18: absolutize
def absolutize(self, uri, defrag=1):
    """Resolve `uri` against a file: URL for the current working
    directory and return it as a URIRef, stripping the fragment when
    `defrag` is truthy."""
    cwd_base = urljoin("file:", pathname2url(os.getcwd()))
    result = urljoin("%s/" % cwd_base, uri, allow_fragments=not defrag)
    if defrag:
        result = urldefrag(result)[0]
    elif uri and uri[-1] == "#" and result[-1] != "#":
        # urljoin can drop a trailing empty-fragment marker; restore it
        result = "%s#" % result
    return URIRef(result)
开发者ID:pkuyken,项目名称:RDFTranslator,代码行数:9,代码来源:namespace.py
示例19: html_to_lxml
def html_to_lxml(url, text, clean=False):
    """Parse plain-text HTML into an `lxml` tree."""
    if clean:
        # round-trip through pandoc to normalize the markup first
        text = _text_from_sp(('pandoc', '--from=html', '--to=html5'),
                             text.encode())
    document = lxml.html.document_fromstring(text)

    def _blank_self_links(link):
        # Endless loops ahoy: a link back to this very page becomes ''
        return '' if urldefrag(link).url == url else link

    document.rewrite_links(_blank_self_links, base_href=url)
    return document
开发者ID:openpatata,项目名称:openpatata-scrapers,代码行数:10,代码来源:text_utils.py
示例20: extract_links
def extract_links(self, html):
    """Return every anchor href in `html`, made absolute against the
    crawler's seed URL and stripped of its fragment."""
    log.debug("extract_links")
    soup = BeautifulSoup(html, "lxml")
    found = []
    for anchor in soup.find_all('a', href=True):
        absolute = urljoin(self.seed, anchor.get('href'))
        defragged, _ = urldefrag(absolute)
        found.append(defragged)
    log.info("found {}".format(len(found)))
    return found
开发者ID:pauloalem,项目名称:sieve,代码行数:11,代码来源:crawler.py
注:本文中的urllib.parse.urldefrag函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论