Python parse.urldefrag Function Code Examples


This article collects typical usage examples of the urllib.parse.urldefrag function in Python. If you have been wondering what urldefrag does, how to call it, or what real-world usage looks like, the curated code examples below should help.



The following presents 20 code examples of the urldefrag function, drawn from open-source projects and ordered by popularity.
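Before the examples, here is a minimal sketch of the function's behavior itself (Python 3, standard library only), since every example below builds on it: urldefrag splits a URL into the URL minus its fragment identifier and the fragment itself, returned as a DefragResult named tuple.

from urllib.parse import urldefrag

# Split a URL into its document part and its fragment identifier.
result = urldefrag("https://example.com/page.html#section-2")
print(result.url)       # https://example.com/page.html
print(result.fragment)  # section-2

# DefragResult is a named two-tuple, so it also unpacks directly,
# which is the style most of the examples below use.
url, fragment = urldefrag("https://example.com/page.html#section-2")

# A URL without a fragment comes back unchanged, with an empty fragment.
print(urldefrag("https://example.com/page.html"))
# DefragResult(url='https://example.com/page.html', fragment='')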

Example 1: startElementNS

 def startElementNS(self, name, qname, attrs):
     stack = self.stack
     stack.append(ElementHandler())
     current = self.current
     parent = self.parent
     base = attrs.get(BASE, None)
     if base is not None:
         base, frag = urldefrag(base)
         if parent and parent.base:
             base = urljoin(parent.base, base)
         else:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base = urljoin(systemId, base)
     else:
         if parent:
             base = parent.base
         if base is None:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base, frag = urldefrag(systemId)
     current.base = base
     language = attrs.get(LANG, None)
     if language is None:
         if parent:
             language = parent.language
     current.language = language
     current.start(name, qname, attrs)
Developer: 0038lana, Project: Test-Task, Lines: 30, Source: rdfxml.py


Example 2: __init__

    def __init__(self, url, previous=None, **info):
        # Apply the simple idempotent optimizations to all urls (no need to
        # ever deal with "HTTP://.."). This means case-sensitivity, and a
        # whole lot of other things that the urlnorm library will do for us.
        # We call this the original url, even though it is a bit of a lie.
        try:
            self.original_url = urlnorm.norm(url)
        except urlnorm.InvalidUrl as e:
            raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

        # For the normalized url that we'll be exposing, remove the
        # fragment, and treat https and http the same.
        url, fragment = urldefrag(self.original_url)
        self.lossy_url_data = {'fragment': fragment}
        if url.startswith('https:'):
            url = 'http' + url[5:]
            self.lossy_url_data.update({'protocol': 'https'})
        self.url = url

        self.set_previous(previous)
        self.info = info
        self.post = None

        # Runtime data
        self.response = None
        self.exception = None
        self.retries = 0
Developer: miracle2k, Project: track0, Lines: 27, Source: spider.py


Example 3: getlinks

def getlinks(pageurl, pageresponse, domain):
    """Returns a list of links from from this page to be crawled.
    pageurl = URL of this page
    pageresponse = page content; response object from requests module
    domain = domain being crawled (None to return links to *any* domain)
    """
    soup = bs4.BeautifulSoup(pageresponse.text, "html.parser")

    # get target URLs for all links on the page
    links = [a.attrs.get('href') for a in soup.select('a[href]')]

    # remove fragment identifiers
    links = [urldefrag(link)[0] for link in links]

    # remove any empty strings
    links = [link for link in links if link]

    # if it's a relative link, change to absolute
    links = [link if bool(urlparse(link).netloc) else urljoin(pageurl, link) \
        for link in links]

    # if only crawling a single domain, remove links to other domains
    if domain:
        links = [link for link in links if urlparse(link).netloc == domain]

    return links
Developer: enamoni, Project: CrawlerPy3, Lines: 26, Source: assignment.py


Example 4: get_div_link

 def get_div_link(self, tip):
     tag_a = tip.parent.find('a', class_='qlink')
     if tag_a:
         url = tag_a.get('href')
         return urldefrag(url)[0]
     else:
         return ''
Developer: littlezz, Project: IslandCollection, Lines: 7, Source: adnmb.py


Example 5: url

    def url(self, name, force=False):
        """
        Returns the real URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                cache_key = self.cache_key(name)
                hashed_name = self.cache.get(cache_key)
                if hashed_name is None:
                    hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                    # set the cache if there was a miss
                    # (e.g. if cache server goes down)
                    self.cache.set(cache_key, hashed_name)

        final_url = super(CachedFilesMixin, self).url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Developer: nai-central, Project: django-mediafiles, Lines: 33, Source: storage.py
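Worth noting why the code special-cases '?#' in name: urldefrag reassembles the URL with urlunsplit, which drops an empty query string, so the ?#iefix font hack would lose its '?'. A quick standard-library check (my illustration, not part of the project's code):

from urllib.parse import urldefrag

# The '?' carries an empty query, so the defragmented URL loses it:
print(urldefrag('myfont.eot?#iefix'))
# DefragResult(url='myfont.eot', fragment='iefix')
# Re-attaching only the fragment would give 'myfont.eot#iefix';
# the query_fragment branch above restores the missing '?'.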


Example 6: oauth_callback

def oauth_callback():
    if not settings.OAUTH:
        abort(404)

    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")

    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    for (_, role) in response:
        if role is None:
            continue
        db.session.commit()
        update_role(role)
        log.info("Logged in: %r", role)
        request.authz = Authz.from_role(role)
        record_audit(Audit.ACT_LOGIN)
        token = request.authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)

    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
Developer: pudo, Project: aleph, Lines: 29, Source: sessions_api.py


Example 7: replace_refs

    def replace_refs(cls, obj, _recursive=False, **kwargs):
        """
        Returns a deep copy of `obj` with all contained JSON reference objects
        replaced with :class:`JsonRef` instances.

        :param obj: If this is a JSON reference object, a :class:`JsonRef`
            instance will be created. If `obj` is not a JSON reference object,
            a deep copy of it will be created with all contained JSON
            reference objects replaced by :class:`JsonRef` instances
        :param base_uri: URI to resolve relative references against
        :param loader: Callable that takes a URI and returns the parsed JSON
            (defaults to global ``jsonloader``, a :class:`JsonLoader` instance)
        :param jsonschema: Flag to turn on `JSON Schema mode
            <http://json-schema.org/latest/json-schema-core.html#anchor25>`_.
            'id' keyword changes the `base_uri` for references contained within
            the object
        :param load_on_repr: If set to ``False``, :func:`repr` call on a
            :class:`JsonRef` object will not cause the reference to be loaded
            if it hasn't already. (defaults to ``True``)

        """

        store = kwargs.setdefault("_store", _URIDict())
        base_uri, frag = urlparse.urldefrag(kwargs.get("base_uri", ""))
        store_uri = None  # If this does not get set, we won't store the result
        if not frag and not _recursive:
            store_uri = base_uri
        try:
            if kwargs.get("jsonschema") and isinstance(obj["id"], basestring):
                kwargs["base_uri"] = urlparse.urljoin(
                    kwargs.get("base_uri", ""), obj["id"]
                )
                store_uri = kwargs["base_uri"]
        except (TypeError, LookupError):
            pass

        try:
            if not isinstance(obj["$ref"], basestring):
                raise TypeError
        except (TypeError, LookupError):
            pass
        else:
            return cls(obj, **kwargs)

        # If our obj was not a json reference object, iterate through it,
        # replacing children with JsonRefs
        kwargs["_recursive"] = True
        path = list(kwargs.pop("_path", ()))
        if isinstance(obj, Mapping):
            obj = type(obj)(
                (k, cls.replace_refs(v, _path=path+[k], **kwargs))
                for k, v in iteritems(obj)
            )
        elif isinstance(obj, Sequence) and not isinstance(obj, basestring):
            obj = type(obj)(
                cls.replace_refs(v, _path=path+[i], **kwargs) for i, v in enumerate(obj)
            )
        if store_uri is not None:
            store[store_uri] = obj
        return obj
Developer: benzhou1, Project: jsonref, Lines: 60, Source: jsonref.py
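For context, a hedged usage sketch of the classmethod above, assuming the jsonref package is installed (the document and its local "#/..." reference are my own illustration, not from the project):

from jsonref import JsonRef

document = {
    "definitions": {"name": {"type": "string"}},
    "properties": {"first": {"$ref": "#/definitions/name"}},
}

# Each {"$ref": ...} object becomes a lazy JsonRef proxy for its target.
resolved = JsonRef.replace_refs(document)
print(resolved["properties"]["first"])  # {'type': 'string'}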


Example 8: getlinks

def getlinks(pageurl, domain, soup):
    """Returns a list of links from from this page to be crawled.

    pageurl = URL of this page
    domain = domain being crawled (None to return links to *any* domain)
    soup = BeautifulSoup object for this page
    """

    # get target URLs for all links on the page
    links = [a.attrs.get('href') for a in soup.select('a[href]')]

    # remove fragment identifiers
    links = [urldefrag(link)[0] for link in links]

    # remove any empty strings
    links = [link for link in links if link]

    # if it's a relative link, change to absolute
    links = [link if bool(urlparse(link).netloc) else urljoin(pageurl, link) \
        for link in links]

    # if only crawling a single domain, remove links to other domains
    if domain:
        links = [link for link in links if samedomain(urlparse(link).netloc, domain)]

    return links
Developer: dmahugh, Project: crawlerino, Lines: 26, Source: crawlerino.py


Example 9: validate_url

def validate_url(url, parent_url='http:'):

    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
        url: string URL
        parent_url: optional string URL from which to inherit an implicit
                    scheme.

    Returns: dict having:
        valid: boolean truth value.
        url: string modified URL.
    """

    if bytes == type(url):
        url = url.decode()

    parsed_url = urlparse(url)

    if 0 < len(parsed_url.path) and '/' == parsed_url.path[0]:
        url = urldefrag(urljoin(parent_url, url))[0]

    elif not parsed_url.scheme:
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url

    parsed_url = urlparse(url)

    valid = parsed_url.scheme in ('http', 'https', '') and \
            bool(parsed_url.netloc)

    return {'valid': valid, 'url': url}
Developer: samalba, Project: image-spider, Lines: 33, Source: validate_url.py
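A quick check of the behavior above (my own calls, reasoned from the code as shown): absolute paths are joined against parent_url and defragmented, while URLs with an unrecognized scheme fail validation:

print(validate_url('/docs#intro', 'https://example.com'))
# {'valid': True, 'url': 'https://example.com/docs'}

print(validate_url('ftp://example.com/file'))
# {'valid': False, 'url': 'ftp://example.com/file'}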


Example 10: validate

def validate(url):
  if url in visitedUrls: return

  visitedUrls.append(url)
  try:
    content = urlopen(url).read().decode("utf8")
  except:
    # Assume the content is binary.
    return

  wikiUrls = []
  invalidUrls = []
  # This may seem redundant, but without the `.find_all('a')`, soup will also
  # contain the `DocType` element which does not have an `href` attribute.
  # See <http://stackoverflow.com/questions/17943992/beautifulsoup-and-soupstrainer-for-getting-links-dont-work-with-hasattr-returni>.
  soup = BeautifulSoup(content, parse_only=SoupStrainer('a', href=True)).find_all('a')
  for externalUrl in soup:
    fullExternalUrl = urljoin(url, urldefrag(externalUrl['href']).url)
    if baseUrl in fullExternalUrl and \
        not fullExternalUrl.endswith('/_history'):
      if externalUrl.has_attr('class') and 'absent' in externalUrl['class']:
        invalidUrls.append(fullExternalUrl)
      else:
        wikiUrls.append(fullExternalUrl)

  if len(invalidUrls) > 0:
    invalidWikiPages.append((url, invalidUrls))

  for wikiUrl in wikiUrls:
    if wikiUrl not in visitedUrls:
      validate(wikiUrl)
Developer: bamos, Project: github-wiki-link-validator, Lines: 31, Source: link-validator.py


Example 11: _parse

    def _parse(self, page: BeautifulSoup, url):
        seasons = OrderedDict()
        eqg = OrderedSet()

        child = page.select_one("#WikiaArticle h2")
        season = child.text

        while child.next_sibling:
            child = child.next_sibling

            if child.name == "table":
                for a in child.find_all("a", string="Transcript"):
                    if not a.has_attr("class") or "new" not in a["class"]:
                        episode_url, fragment = urldefrag(a["href"])
                        episode_url = urljoin(url, episode_url)
                        if "Equestria Girls" not in season:
                            if season not in seasons:
                                seasons[season] = OrderedSet()
                            seasons[season].append(episode_url)
                        else:
                            eqg.append(episode_url)
                continue

            if child.name == "h2":
                season = child.text
                continue

        seasons["Equestria Girls"] = eqg
        return seasons
Developer: ZhangYiJiang, Project: mlp-visualization, Lines: 29, Source: mlp_models.py


Example 12: resolving

    def resolving(self, ref):
        """
        Context manager which resolves a JSON ``ref`` and enters the
        resolution scope of this ref.

        :argument str ref: reference to resolve

        """

        full_uri = urlparse.urljoin(self.resolution_scope, ref)
        uri, fragment = urlparse.urldefrag(full_uri)

        if uri in self.store:
            document = self.store[uri]
        elif not uri or uri == self.base_uri:
            document = self.referrer
        else:
            document = self.resolve_remote(uri)

        old_base_uri, old_referrer = self.base_uri, self.referrer
        self.base_uri, self.referrer = uri, document
        try:
            with self.in_scope(uri):
                yield self.resolve_fragment(document, fragment)
        finally:
            self.base_uri, self.referrer = old_base_uri, old_referrer
Developer: alexstrat, Project: jsonschema, Lines: 26, Source: jsonschema.py
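A hedged usage sketch of such a resolving() context manager, using the standalone jsonschema package this example derives from (RefResolver is deprecated in recent jsonschema releases, so treat this as illustrative rather than canonical):

from jsonschema import RefResolver

schema = {"definitions": {"name": {"type": "string"}}}
resolver = RefResolver.from_schema(schema)

# Enter the resolution scope of the reference and get the resolved target.
with resolver.resolving("#/definitions/name") as resolved:
    print(resolved)  # {'type': 'string'}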


Example 13: _url

    def _url(self, hashed_name_func, name, force=False, hashed_files=None):
        """
        Return the non-hashed URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                args = (clean_name,)
                if hashed_files is not None:
                    args += (hashed_files,)
                hashed_name = hashed_name_func(*args)

        final_url = super().url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Developer: Damgaard, Project: django, Lines: 30, Source: storage.py


Example 14: extract_domains

def extract_domains(site_text):
    domains = set()
    only_a_tags = SoupStrainer("a")
    for link in BeautifulSoup(site_text, "html.parser", parse_only=only_a_tags):
        if link.has_attr('href') and urlparse(link["href"]).scheme not in ["", "mailto"]:
            domains.add(urldefrag(link["href"])[0])
    return list(domains)
Developer: creade, Project: pl-crawler, Lines: 7, Source: crawler.py


Example 15: splitDecodeFragment

def splitDecodeFragment(url):
    if url is None: # urldefrag returns byte strings for None, instead of unicode strings
        return _STR_UNICODE(""), _STR_UNICODE("")
    urlPart, fragPart = urldefrag(url)
    if isPy3:
        return (urlPart, unquote(fragPart, "utf-8", errors=None))
    else:
        return _STR_UNICODE(urlPart), unquote(_STR_UNICODE(fragPart), "utf-8", errors=None)
Developer: jasonleinbach-wf, Project: Arelle-1, Lines: 8, Source: UrlUtil.py


Example 16: _meta_schemas

def _meta_schemas():
    """
    Collect the urls and meta schemas from each known validator.

    """

    meta_schemas = (v.META_SCHEMA for v in validators.values())
    return dict((urlparse.urldefrag(m["id"])[0], m) for m in meta_schemas)
Developer: alexstrat, Project: jsonschema, Lines: 8, Source: jsonschema.py


Example 17: test_spider

def test_spider(client, app, check_external_links):
    """Check that all links work

    Spiders the site, making sure all internal links point to existing pages.
    Includes fragments: any #hash in a link must correspond to an existing
    element with that id.

    If check_external_links is true, checks external links as well.
    """
    to_visit = {'http://localhost/'}
    visited = set()
    external = set()

    wanted_fragments = collections.defaultdict(set)
    page_ids = {}

    def recording_url_for(*args, **kwargs):
        url = flask.url_for(*args, **kwargs)
        if url not in visited:
            to_visit.add(urljoin('http://localhost/', url))
        return url

    app.jinja_env.globals['url_for'] = recording_url_for

    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)
        links = []
        parsed = urlparse(url)
        if parsed.netloc == 'localhost':
            print('visit', url)
            page_ids[url] = []
            check_url(client, url, links, page_ids[url])
            for link in links:
                fullurl = urljoin('http://localhost/', url)
                fullurl = urljoin(fullurl, link)
                result = urldefrag(fullurl)
                defrag = result.url
                fragment = result.fragment
                if fragment:
                    wanted_fragments[defrag].add(fragment)
                if defrag not in visited:
                    to_visit.add(defrag)
        else:
            if parsed.scheme in ('http', 'https'):
                external.add(url)
            else:
                print('ignore', url)

    for url, fragments in wanted_fragments.items():
        assert fragments <= set(page_ids[url])

    if check_external_links:
        for url in external:
            print('check', url)
            check_external_link(url)
Developer: encukou, Project: pyvo.cz, Lines: 58, Source: test_generic.py


Example 18: absolutize

 def absolutize(self, uri, defrag=1):
     base = urljoin("file:", pathname2url(os.getcwd()))
     result = urljoin("%s/" % base, uri, allow_fragments=not defrag)
     if defrag:
         result = urldefrag(result)[0]
     if not defrag:
         if uri and uri[-1] == "#" and result[-1] != "#":
             result = "%s#" % result
     return URIRef(result)
Developer: pkuyken, Project: RDFTranslator, Lines: 9, Source: namespace.py


Example 19: html_to_lxml

def html_to_lxml(url, text, clean=False):
    """Parse plain-text HTML into an `lxml` tree."""
    if clean:
        text = _text_from_sp(('pandoc', '--from=html', '--to=html5'),
                             text.encode())
    html = lxml.html.document_fromstring(text)
    # Endless loops ahoy
    html.rewrite_links(lambda s: '' if urldefrag(s).url == url else s,
                       base_href=url)
    return html
Developer: openpatata, Project: openpatata-scrapers, Lines: 10, Source: text_utils.py


Example 20: extract_links

 def extract_links(self, html):
     log.debug("extract_links")
     soup = BeautifulSoup(html, "lxml")
     links = []
     for link in soup.find_all('a', href=True):
         href = link.get('href')
         href = urljoin(self.seed, href)
         href, _ = urldefrag(href)
         links.append(href)
     log.info("found {}".format(len(links)))
     return links
Developer: pauloalem, Project: sieve, Lines: 11, Source: crawler.py



Note: The urllib.parse.urldefrag function examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.

