
Python parse.urlsplit Function Code Examples


This article collects typical usage examples of the urllib.parse.urlsplit function in Python. If you have been wondering what urlsplit does, how to call it, or what real-world uses of it look like, the curated code examples below should help.



Twenty code examples of the urlsplit function are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
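Before the examples, a quick orientation: urlsplit parses a URL string into a five-field SplitResult named tuple (scheme, netloc, path, query, fragment), and urlunsplit reassembles one back into a URL. A minimal standalone sketch, not drawn from any of the projects below:

from urllib.parse import urlsplit, urlunsplit

parts = urlsplit("https://example.com/a/b?q=1#top")
print(parts.scheme)    # 'https'
print(parts.netloc)    # 'example.com'
print(parts.path)      # '/a/b'
print(parts.query)     # 'q=1'
print(parts.fragment)  # 'top'

# SplitResult is a named tuple, so a single field can be swapped
# with _replace and the URL rebuilt without string surgery.
print(urlunsplit(parts._replace(scheme="http")))  # http://example.com/a/b?q=1#top

Example 2 below uses exactly this _replace pattern to swap schemes back and forth.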

Example 1: _main

def _main():
    base_url = sys.argv[1]
    soup = bs4.BeautifulSoup(urlopen(base_url), from_encoding="windows-1252")
    index_urls = [urljoin(base_url, h3("a")[0]["href"]) for h3 in soup("h3")]
    for index_url in index_urls:
        try:
            resp = urlopen(index_url)
        except HTTPError as err:
            print(err, err.url, file=sys.stderr)
            print("Skipping..", file=sys.stderr)
            continue
        index_soup = bs4.BeautifulSoup(resp, from_encoding="iso-8859-1")
        index_path = urlsplit(index_url).path
        index_filepath = os.path.normpath("." + index_path)
        try:
            os.makedirs(os.path.dirname(index_filepath))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise e
        for issue_url in iter_issue_urls(index_soup):
            issue_url = urljoin(index_url, issue_url)
            try:
                resp = urlopen(issue_url)
            except HTTPError as err:
                print(err, err.url, file=sys.stderr)
                print("Skipping..", file=sys.stderr)
                continue
            issue_soup = bs4.BeautifulSoup(resp, from_encoding="windows-1252")
            issue_path = urlsplit(issue_url).path
            issue_filepath = os.path.normpath("." + issue_path)
            with open(issue_filepath, "w") as f:
                print(klupu.clean_soup(issue_soup), file=f)
        with open(index_filepath, "w") as f:
            print(klupu.clean_soup(index_soup), file=f)
Author: imclab, Project: klupu, Lines of code: 34, Source: fetch.py


Example 2: oauth

    def oauth(self, req, credentials=None, params={}):
        # NOTE: While flickr supports HTTPS in its oauth endpoints, flickr
        # thinks that the HTTPS endpoints are being accessed via HTTP, and
        # thus constructs the signature base string accordingly, which will
        # hence not match the signature base string generated by
        # pyoauth1client. We solve this by replacing HTTPS with HTTP when
        # generating the signature base string, and then revert the change
        # after the base string is generated. This way the signature base
        # string will match the one generated by flickr even though we are
        # accessing the endpoints via HTTPS for ADDED SECURITY!!!111one
        x = urlsplit(req.url)
        if x.scheme == "https":
            # Remove the HTTPS scheme
            https = True
            x = x._replace(scheme="http")
            req = req._replace(url=urlunsplit(x))
        else:
            https = False
        y = super().oauth(req, credentials, params)
        if https:
            # Add back the HTTPS scheme
            x = urlsplit(y.url)
            x = x._replace(scheme="https")
            y = y._replace(url=urlunsplit(x))
        return y
Author: pyokagan, Project: pyoauth1client, Lines of code: 25, Source: __init__.py


Example 3: main

def main(GET):
    global mail, error, error_list
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
    print("\n\nresult--------------------------------\nerror:%d" % (error))
    count = 1
    for url in error_list:
        print(url)
    print("\n")
    for url in mail:
        print("[%d]url:%s" % (count, url))
        data = mail[url][0]
        if data:
            tmp = []
            for val in data:
                if val not in tmp:
                    print(val)
                tmp.append(val)
        else:
            print("None")
        print("")
        count += 1
Author: cheersa, Project: python, Lines of code: 28, Source: hw3.py


Example 4: __form_data

    def __form_data(text, formid, params, soup=None, form_url=None):
        if type(params) is not dict:
            raise TypeError('Params must be a dict')
        if soup is None:
            soup = BeautifulSoup(text, 'html.parser')
        form = soup.find('form', attrs={'id': formid})
        action = form.attrs.get('action')
        if not urlsplit(action).netloc:
            if form_url is None or not urlsplit(form_url).netloc:
                raise ValueError('kwarg form_url must be specified if form '
                                 'action lacks a host')
            action = urljoin(form_url, action)
        inputs = form.find_all('input') + form.find_all('textarea')
        for i in inputs:
            try:
                name = i.attrs['name']
                type_ = i.attrs['type']
                value = params.get(name)
                if type_ == 'submit':
                    continue
                elif type_ == 'hidden':
                    value = i.attrs['value'] if value is None else value
                elif value is None:
                    raise ValueError('kwarg params dictionary is missing a '
                                     'value for a non-hidden field')
            except KeyError:
                pass
            else:
                params[name] = value
        return Session.FormInfo(params=params, post_url=action)
Author: lachm, Project: fbbot, Lines of code: 30, Source: infra.py


Example 5: clean_url

def clean_url(value):
    """
    Taken from Django' URLField, this helps to normalize URLs. Raises a
    ValueError if an invalid url is passed.

    Example:

    >>> clean_url("www.google.com")
    "http://www.google.com"

    >>> clean_url("_.com")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError: Enter a valid URL.
    """
    if value:
        value = value.strip()
        value = value.encode('ascii', 'ignore').decode("utf-8")
        url_fields = list(urlsplit((value)))
        if not url_fields[0]:
            # If no URL scheme given, assume http://
            url_fields[0] = 'http'
        if not url_fields[1]:
            # Assume that if no domain is provided, that the path segment
            # contains the domain.
            url_fields[1] = url_fields[2]
            url_fields[2] = ''
            # Rebuild the url_fields list, since the domain segment may now
            # contain the path too.
            url_fields = list(urlsplit((urlunsplit(url_fields))))
        if not url_fields[2]:
            # the path portion may need to be added before query params
            url_fields[2] = '/'
        value = urlunsplit(url_fields)
    return value
Author: TrackMaven, Project: trackmaven-common, Lines of code: 35, Source: urls.py


Example 6: assertRedirects

    def assertRedirects(self, response, expected_url, status_code=302,
                        target_status_code=200, host=None):
        """Asserts that a response redirected to a specific URL, and that the
        redirect URL can be loaded.

        Note that assertRedirects won't work for external links since it uses
        TestClient to do a request.
        """
        self.assertEqual(response.status_code, status_code,
            ("Response didn't redirect as expected: Response code was %d"
             " (expected %d)" % (response.status_code, status_code)))
        url = response['Location']
        scheme, netloc, path, query, fragment = urlsplit(url)
        e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
        if not (e_scheme or e_netloc):
            expected_url = urlunsplit(('http', host or 'testserver', e_path,
                    e_query, e_fragment))
        self.assertEqual(url, expected_url,
            "Response redirected to '%s', expected '%s'" % (url, expected_url))

        # Get the redirection page, using the same client that was used
        # to obtain the original response.
        redirect_response = response.client.get(path, QueryDict(query))
        self.assertEqual(redirect_response.status_code, target_status_code,
            ("Couldn't retrieve redirection page '%s': response code was %d"
             " (expected %d)") %
                 (path, redirect_response.status_code, target_status_code))
Author: gitdlam, Project: geraldo, Lines of code: 27, Source: testcases.py


Example 7: parse_url

def parse_url(link):
    """Say Website Title information in channel"""
    baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link))
    path = urlsplit(link).path
    query = '?{uri.query}'.format(uri=urlsplit(link))
    try:
        headers = {'Accept-Encoding': 'utf-8',
                   'User-Agent': 'Mozilla/5.0'}
        response = get(baseurl + path + query, headers=headers)
    except:
        return
    if response.headers["Content-Type"] and "text/html" in response.headers["Content-Type"]:
        try:
            URL = BeautifulSoup(response.text, "html.parser")
        except:
            return
        if not URL.title:
            return
        if URL.title.string is None:
            return
        if len(URL.title.string) > 250:
            title = URL.title.string[0:250] + '…'
        else:
            title = URL.title.string
        return title.replace('\n', ' ').strip() + " (" + urlsplit(link).netloc + ")"
    else:
        return
Author: meskarune, Project: autobot, Lines of code: 27, Source: url_announce.py


Example 8: find_pingback_urls

    def find_pingback_urls(self, urls):
        """Find the pingback urls of each urls"""
        pingback_urls = {}

        for url in urls:
            try:
                page = urlopen(url)
                headers = page.info()

                if 'text/' not in headers.get('Content-Type', '').lower():
                    continue

                server_url = headers.get('X-Pingback')
                if not server_url:
                    server_url = self.find_pingback_href(page.read())

                if server_url:
                    server_url_splitted = urlsplit(server_url)
                    if not server_url_splitted.netloc:
                        url_splitted = urlsplit(url)
                        server_url = '%s://%s%s' % (url_splitted.scheme,
                                                    url_splitted.netloc,
                                                    server_url)
                    pingback_urls[url] = server_url
            except IOError:
                pass
        return pingback_urls
Author: sergeny, Project: django-blog-zinnia, Lines of code: 27, Source: ping.py


Example 9: run

    def run(self):
        while True:
            # grabs url from queue
            level, u = self.input_q.get()

            main = '{0.scheme}://{0.netloc}/'.format(urlsplit(u))

            # fetching urls
            if level < MAX_URL_LEVEL:
                html = _get_content(u)
                if not isinstance(html, list):
                    soup = bs(html)
                    for link in soup.find_all('a'):
                        href = link.get('href')
                        
                        if not href or len(href) < 2:
                            continue

                        # Check if URL is relative
                        elif not urlsplit(href)[0] and not urlsplit(href)[1]:
                            self.output_q.put((level+1, _url_discard(urljoin(u, href))))
                        
                        elif href.startswith(main):
                            self.output_q.put((level+1, _url_discard(href)))
                else:
                    # Place for possible error logs (:
                    pass

            # signals to queue job is done
            self.input_q.task_done()
Author: komarovf, Project: uwc2015, Lines of code: 30, Source: parser.py


Example 10: test_flow

    def test_flow(self):
        url = self.sp.make_auth_req()
        status, headers, _ = self.getPage(url)
        assert status == '303 See Other'

        url = self.get_redirect_location(headers)
        req = parse_qs(urlsplit(url).query)
        assert 'SAMLRequest' in req
        assert 'RelayState' in req

        action, body = self.idp.handle_auth_req(req['SAMLRequest'][0],
                                                req['RelayState'][0],
                                                BINDING_HTTP_REDIRECT,
                                                'test1')
        status, headers, body = self.getPage(action, method='POST',
                                             body=urlencode(body))
        assert status == '302 Found'

        url = self.get_redirect_location(headers)
        req = parse_qs(urlsplit(url).query)
        assert 'SAMLResponse' in req
        assert 'RelayState' in req
        resp = self.sp.parse_authn_request_response(req['SAMLResponse'][0],
                                                    BINDING_HTTP_REDIRECT)
        identity = resp.ava
        assert identity["displayName"][0] == "Test1"
        assert identity["sn"][0] == "[email protected]"
        assert identity['o'][0] == "Small university"
Author: ibrsp, Project: s2sproxy, Lines of code: 28, Source: test_proxy_server.py


Example 11: _url

    def _url(self, hashed_name_func, name, force=False, hashed_files=None):
        """
        Return the non-hashed URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                args = (clean_name,)
                if hashed_files is not None:
                    args += (hashed_files,)
                hashed_name = hashed_name_func(*args)

        final_url = super().url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Author: Damgaard, Project: django, Lines of code: 30, Source: storage.py


Example 12: https_open

    def https_open(self, request):
        """
        Send an HTTPS request, which can be either GET or POST,
        depending on whether the request carries body data.

        Args:
            request - an instance of urllib.request.Request
        """
        full_url = request.get_full_url()
        url_parts = parse.urlsplit(full_url)
        robo = None
        if url_parts.netloc in self.robots:
            robo = self.robots[url_parts.netloc]
        else:
            # Getting request url, for checking robots.txt
            host = parse.urlsplit(full_url)[1]
            rurl = parse.urlunparse(("http", host, "/robots.txt", "", ""))
            robo = reppy.cache.RobotsCache()
            robo.fetch(rurl, self.agent_name)
            self.robots[url_parts.netloc] = robo

        # Is the URL allowed for this crawler by robots.txt?
        if robo.allowed(full_url, self.agent_name):
            # Return the result of the request (assumes this handler
            # subclasses urllib.request.HTTPSHandler)
            return super().https_open(request)
        else:
            raise RuntimeError('Forbidden by robots.txt')
Author: Armoken, Project: Learning, Lines of code: 27, Source: crawler.py


Example 13: get_fetcher

def get_fetcher(url=None, *, item=dict()):
    RTMP_PROTOCOLS = {'rtmp', 'rtmpt', 'rtmpe', 'rtmpte'}

    url = item.get("url", url)
    if urlsplit(url).scheme in RTMP_PROTOCOLS:
        return RtmpFetcher(url, live=True)

    auth = comm.get_auth()
    protocol = urlsplit(auth['server']).scheme
    if protocol in RTMP_PROTOCOLS:
        (url, ext) = url.rsplit('.', 1)  # strip the extension (.flv or .mp4)
        url = auth['playpath_prefix'] + url

        if ext == 'mp4':
            url = 'mp4:' + url

        rtmp_url = auth['rtmp_url']
        token = auth.get('token')
        if token:
            # Cannot use urljoin() because
            # the RTMP scheme would have to be added to its whitelist
            rtmp_url += '?auth=' + token

        return RtmpFetcher(rtmp_url, playpath=url)
    else:
        return HdsFetcher(url, auth)
Author: timwhite, Project: python-iview, Lines of code: 26, Source: fetch.py


Example 14: zoom_article

    def zoom_article(self, ticket_id, article_id):
        art_descr = self.__db.article_description(article_id)
        if art_descr[4] & ART_TEXT:
            return eval(self.__db.article_message(article_id))
        self.echo("Zoom article:", ticket_id, article_id)
        url_beg = urlsplit(self.runtime.get("site"))[:3]
        params = (
            ("Action", "AgentTicketZoom"), ("Subaction", "ArticleUpdate"),
            ("TicketID", ticket_id), ("ArticleID", article_id),
            ("OTRSAgentInterface", self.runtime["OTRSAgentInterface"]))
        url = urlunsplit(url_beg + (urlencode(params), ""))
        pg = TicketsPage(self.core)
        page = pg.load(url)
        if page is None:
            return
        mail_header = page.get("mail_header", [])
        if "mail_src" in page:
            url = urlunsplit(url_beg[:2] + urlsplit(page["mail_src"])[2:])
            self.echo("Get message:", url)
            pg = MessagePage(self.core)
            try:
                mail_text = pg.load(url)
            except LoginError:
                mail_text = pg.login()
        else:
            mail_text = page["message_text"]
        if mail_header:
            mail_text.insert(0, ("\n",))
        for i in reversed(mail_header):
            mail_text.insert(0, ("%s\t%s\n" % i,))
        shrink_tupled_text(mail_text)
        self.__db.article_message(article_id, repr(mail_text))
        return mail_text
Author: Lysovenko, Project: OTRS_US, Lines of code: 33, Source: msg_ldr.py


Example 15: main

def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
Author: Kasfen, Project: networkprogramming, Lines of code: 7, Source: rscrape1.py


Example 16: find_pingback_urls

    def find_pingback_urls(self, urls):
        """
        Find the pingback URL for each URLs.
        """
        pingback_urls = {}

        for url in urls:
            try:
                page = urlopen(url)
                headers = page.info()

                server_url = headers.get('X-Pingback')

                if not server_url:
                    content_type = headers.get('Content-Type', '').split(
                        ';')[0].strip().lower()
                    if content_type in ['text/html', 'application/xhtml+xml']:
                        server_url = self.find_pingback_href(
                            page.read(5 * 1024))

                if server_url:
                    server_url_splitted = urlsplit(server_url)
                    if not server_url_splitted.netloc:
                        url_splitted = urlsplit(url)
                        server_url = '%s://%s%s' % (url_splitted.scheme,
                                                    url_splitted.netloc,
                                                    server_url)
                    pingback_urls[url] = server_url
            except IOError:
                pass
        return pingback_urls
Author: Albertzzzz, Project: django-blog-zinnia, Lines of code: 31, Source: ping.py


Example 17: __init__

    def __init__(self, registry, url="", auth=None, verify=False,
                 api_timeout=None):
        # Registry ip:port
        self.registry = urlsplit(registry).netloc
        # Service url, ip:port
        self.url = url
        # Authentication (user, password) or None. Used by request to do
        # basicauth
        self.auth = auth
        # Timeout for HTTP request
        self.api_timeout = api_timeout

        # Desired scope is the scope needed for the next operation on the
        # registry
        self.desired_scope = ""
        # Scope of the token we have
        self.scope = ""
        # Token used to authenticate
        self.token = ""
        # Boolean to enforce https checks. Used by request
        self.verify = verify

        # If we have no url then tokens are not required. get_new_token will not
        # be called
        if url:
            split = urlsplit(url)
            # user in url will take precedence over given username
            if split.username and split.password:
                self.auth = (split.username, split.password)

            self.token_required = True
        else:
            self.token_required = False
Author: pombredanne, Project: docker-registry-client, Lines of code: 33, Source: AuthorizationService.py


Example 18: hashed_name

    def hashed_name(self, name, content=None, filename=None):
        # `filename` is the name of file to hash if `content` isn't given.
        # `name` is the base name to construct the new hashed filename from.
        parsed_name = urlsplit(unquote(name))
        clean_name = parsed_name.path.strip()
        filename = (filename and urlsplit(unquote(filename)).path.strip()) or clean_name
        opened = content is None
        if opened:
            if not self.exists(filename):
                raise ValueError("The file '%s' could not be found with %r." % (filename, self))
            try:
                content = self.open(filename)
            except IOError:
                # Handle directory paths and fragments
                return name
        try:
            file_hash = self.file_hash(clean_name, content)
        finally:
            if opened:
                content.close()
        path, filename = os.path.split(clean_name)
        root, ext = os.path.splitext(filename)
        if file_hash is not None:
            file_hash = ".%s" % file_hash
        hashed_name = os.path.join(path, "%s%s%s" %
                                   (root, file_hash, ext))
        unparsed_name = list(parsed_name)
        unparsed_name[2] = hashed_name
        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        if '?#' in name and not unparsed_name[3]:
            unparsed_name[2] += '?'
        return urlunsplit(unparsed_name)
Author: ArcTanSusan, Project: django, Lines of code: 33, Source: storage.py


Example 19: sitelinks

    def sitelinks(self, html_page, url):
        """Finds all links in the provided html page"""
        bs = BeautifulSoup(html_page)
        links = set()
        urlpart = urlsplit(url)

        try:
            for anchor in bs.find_all('a'):
                linkpart = list(urlsplit(anchor['href']))
                linkpart[4] = '' #remove the fragment

                if linkpart[0] == '':
                    linkpart[0] = urlpart.scheme

                if linkpart[1] == '':
                    linkpart[1] = urlpart.netloc

                if linkpart[0] == urlpart.scheme and linkpart[1] == urlpart.netloc:
                    if linkpart[2].startswith('/'):
                        links.add(urlunsplit(linkpart))
                    elif linkpart[2] != '':
                        #relative URL.
                        links.add(urljoin(url, linkpart[2]))
        except KeyError:
            pass

        return links
Author: nada-labs, Project: sitemap-generator, Lines of code: 27, Source: spider.py


Example 20: __init__

    def __init__(
        self,
        url=DEFAULT_URI,
        name=None,
        ssl_required=False,
        verbose=False,
        pedantic=False,
        socket_keepalive=False
    ):
        self._connect_timeout = None
        self._socket_keepalive = socket_keepalive
        self._socket = None
        self._socket_file = None
        self._subscriptions = {}
        self._next_sid = 1
        self._server = None
        self._server_index = 0

        if isinstance(url, (list, tuple)):
            urls = [urlparse.urlsplit(x) for x in url]
        else:
            urls = [urlparse.urlsplit(url)]

        self._options = {
            'url': urls,
            'name': name,
            'ssl_required': ssl_required,
            'verbose': verbose,
            'pedantic': pedantic
        }
Author: bahadir, Project: pynats, Lines of code: 30, Source: connection.py



Note: The urllib.parse.urlsplit examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.

