This article collects typical usage examples of the urlsplit function from Python's urllib.parse module. If you are wondering what urlsplit does, how to call it, or what real-world uses look like, the hand-picked examples below should help.
The following 20 code examples of urlsplit are shown, sorted by popularity by default.
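As a quick, self-contained refresher before the examples: urlsplit breaks a URL into a five-field SplitResult named tuple (scheme, netloc, path, query, fragment), without splitting out URL parameters the way urlparse does. The URL below is purely illustrative:

from urllib.parse import urlsplit

parts = urlsplit("https://example.com:8042/over/there?name=ferret#nose")
print(parts.scheme)    # 'https'
print(parts.netloc)    # 'example.com:8042'
print(parts.path)      # '/over/there'
print(parts.query)     # 'name=ferret'
print(parts.fragment)  # 'nose'
print(parts.hostname, parts.port)  # example.com 8042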
Example 1: _main

def _main():
    base_url = sys.argv[1]
    soup = bs4.BeautifulSoup(urlopen(base_url), from_encoding="windows-1252")
    index_urls = [urljoin(base_url, h3("a")[0]["href"]) for h3 in soup("h3")]
    for index_url in index_urls:
        try:
            resp = urlopen(index_url)
        except HTTPError as err:
            print(err, err.url, file=sys.stderr)
            print("Skipping..", file=sys.stderr)
            continue
        index_soup = bs4.BeautifulSoup(resp, from_encoding="iso-8859-1")
        index_path = urlsplit(index_url).path
        index_filepath = os.path.normpath("." + index_path)
        try:
            os.makedirs(os.path.dirname(index_filepath))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        for issue_url in iter_issue_urls(index_soup):
            issue_url = urljoin(index_url, issue_url)
            try:
                resp = urlopen(issue_url)
            except HTTPError as err:
                print(err, err.url, file=sys.stderr)
                print("Skipping..", file=sys.stderr)
                continue
            issue_soup = bs4.BeautifulSoup(resp, from_encoding="windows-1252")
            issue_path = urlsplit(issue_url).path
            issue_filepath = os.path.normpath("." + issue_path)
            with open(issue_filepath, "w") as f:
                print(klupu.clean_soup(issue_soup), file=f)
        with open(index_filepath, "w") as f:
            print(klupu.clean_soup(index_soup), file=f)

Developer: imclab, Project: klupu, Lines: 34, Source: fetch.py
Example 2: oauth

def oauth(self, req, credentials=None, params={}):
    # NOTE: While flickr supports HTTPS on its OAuth endpoints, flickr
    # believes the HTTPS endpoints are being accessed via HTTP, and thus
    # constructs the signature base string accordingly, which will not
    # match the signature base string generated by pyoauth1client. We
    # solve this by replacing HTTPS with HTTP when generating the
    # signature base string, then reverting the change after the base
    # string is generated. This way the signature base string matches the
    # one generated by flickr even though we are accessing the endpoints
    # via HTTPS for ADDED SECURITY!!!111one
    x = urlsplit(req.url)
    if x.scheme == "https":
        # Remove the HTTPS scheme
        https = True
        x = x._replace(scheme="http")
        req = req._replace(url=urlunsplit(x))
    else:
        https = False
    y = super().oauth(req, credentials, params)
    if https:
        # Add back the HTTPS scheme
        x = urlsplit(y.url)
        x = x._replace(scheme="https")
        y = y._replace(url=urlunsplit(x))
    return y

Developer: pyokagan, Project: pyoauth1client, Lines: 25, Source: __init__.py
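The scheme swap above works because SplitResult is a named tuple, so _replace returns a modified copy that urlunsplit can reassemble. A minimal standalone sketch of the same round trip (the endpoint URL is used purely for illustration):

from urllib.parse import urlsplit, urlunsplit

url = "https://www.flickr.com/services/oauth/request_token"
parts = urlsplit(url)
if parts.scheme == "https":
    # Downgrade the scheme only for signing, as the example above does
    url = urlunsplit(parts._replace(scheme="http"))
print(url)  # http://www.flickr.com/services/oauth/request_token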
Example 3: main

def main(GET):
    global mail, error, error_list
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)
    print("\n\nresult--------------------------------\nerror:%d" % error)
    count = 1
    for url in error_list:
        print(url)
    print("\n")
    for url in mail:
        print("[%d]url:%s" % (count, url))
        data = mail[url][0]
        if data:
            tmp = []
            for val in data:
                if val not in tmp:
                    print(val)
                    tmp.append(val)
        else:
            print("None")
        print("")
        count += 1

Developer: cheersa, Project: python, Lines: 28, Source: hw3.py
Example 4: __form_data

def __form_data(text, formid, params, soup=None, form_url=None):
    if type(params) is not dict:
        raise TypeError('Params must be a dict')
    if soup is None:
        soup = BeautifulSoup(text, 'html.parser')
    form = soup.find('form', attrs={'id': formid})
    action = form.attrs.get('action')
    if not urlsplit(action).netloc:
        if form_url is None or not urlsplit(form_url).netloc:
            raise ValueError('kwarg form_url must be specified if form '
                             'action lacks a host')
        action = urljoin(form_url, action)
    inputs = form.find_all('input') + form.find_all('textarea')
    for i in inputs:
        try:
            name = i.attrs['name']
            type_ = i.attrs['type']
            value = params.get(name)
            if type_ == 'submit':
                continue
            elif type_ == 'hidden':
                value = i.attrs['value'] if value is None else value
            elif value is None:
                raise ValueError('kwarg params dictionary is missing a '
                                 'value for a non-hidden field')
        except KeyError:
            pass
        else:
            params[name] = value
    return Session.FormInfo(params=params, post_url=action)

Developer: lachm, Project: fbbot, Lines: 30, Source: infra.py
Example 5: clean_url

def clean_url(value):
    """
    Taken from Django's URLField, this helps to normalize URLs. Raises a
    ValueError if an invalid URL is passed.

    Example:

    >>> clean_url("www.google.com")
    'http://www.google.com'
    >>> clean_url("_.com")
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError: Enter a valid URL.
    """
    if value:
        value = value.strip()
        value = value.encode('ascii', 'ignore').decode("utf-8")
        url_fields = list(urlsplit(value))
        if not url_fields[0]:
            # If no URL scheme given, assume http://
            url_fields[0] = 'http'
        if not url_fields[1]:
            # Assume that if no domain is provided, the path segment
            # contains the domain.
            url_fields[1] = url_fields[2]
            url_fields[2] = ''
            # Rebuild the url_fields list, since the domain segment may now
            # contain the path too.
            url_fields = list(urlsplit(urlunsplit(url_fields)))
        if not url_fields[2]:
            # The path portion may need to be added before query params.
            url_fields[2] = '/'
        value = urlunsplit(url_fields)
    return value

Developer: TrackMaven, Project: trackmaven-common, Lines: 35, Source: urls.py
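The subtle step in clean_url is re-splitting after moving a schemeless host out of the path: once a scheme is prepended, urlsplit can separate the domain from the path it was fused with. A sketch of just that core trick, with an illustrative input:

from urllib.parse import urlsplit, urlunsplit

fields = list(urlsplit("www.google.com/search"))
print(fields)  # ['', '', 'www.google.com/search', '', ''] -- all in path
fields[0] = 'http'
fields[1], fields[2] = fields[2], ''         # treat the path as the host
fields = list(urlsplit(urlunsplit(fields)))  # re-split to untangle it
print(fields)  # ['http', 'www.google.com', '/search', '', '']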
Example 6: assertRedirects

def assertRedirects(self, response, expected_url, status_code=302,
                    target_status_code=200, host=None):
    """Asserts that a response redirected to a specific URL, and that the
    redirect URL can be loaded.

    Note that assertRedirects won't work for external links since it uses
    TestClient to do a request.
    """
    self.assertEqual(response.status_code, status_code,
                     "Response didn't redirect as expected: Response code was %d"
                     " (expected %d)" % (response.status_code, status_code))
    url = response['Location']
    scheme, netloc, path, query, fragment = urlsplit(url)
    e_scheme, e_netloc, e_path, e_query, e_fragment = urlsplit(expected_url)
    if not (e_scheme or e_netloc):
        expected_url = urlunsplit(('http', host or 'testserver', e_path,
                                   e_query, e_fragment))
    self.assertEqual(url, expected_url,
                     "Response redirected to '%s', expected '%s'" % (url, expected_url))
    # Get the redirection page, using the same client that was used
    # to obtain the original response.
    redirect_response = response.client.get(path, QueryDict(query))
    self.assertEqual(redirect_response.status_code, target_status_code,
                     "Couldn't retrieve redirection page '%s': response code was %d"
                     " (expected %d)" %
                     (path, redirect_response.status_code, target_status_code))

Developer: gitdlam, Project: geraldo, Lines: 27, Source: testcases.py
Example 7: parse_url

def parse_url(link):
    """Say website title information in channel"""
    baseurl = '{uri.scheme}://{uri.netloc}'.format(uri=urlsplit(link))
    path = urlsplit(link).path
    query = '?{uri.query}'.format(uri=urlsplit(link))
    try:
        headers = {'Accept-Encoding': 'utf-8',
                   'User-Agent': 'Mozilla/5.0'}
        response = get(baseurl + path + query, headers=headers)
    except:
        return
    if response.headers["Content-Type"] and "text/html" in response.headers["Content-Type"]:
        try:
            URL = BeautifulSoup(response.text, "html.parser")
        except:
            return
        if not URL.title:
            return
        if URL.title.string is None:
            return
        if len(URL.title.string) > 250:
            title = URL.title.string[0:250] + '…'
        else:
            title = URL.title.string
        return title.replace('\n', ' ').strip() + " (" + urlsplit(link).netloc + ")"
    else:
        return

Developer: meskarune, Project: autobot, Lines: 27, Source: url_announce.py
Example 8: find_pingback_urls

def find_pingback_urls(self, urls):
    """Find the pingback URL for each given URL"""
    pingback_urls = {}
    for url in urls:
        try:
            page = urlopen(url)
            headers = page.info()
            if 'text/' not in headers.get('Content-Type', '').lower():
                continue
            server_url = headers.get('X-Pingback')
            if not server_url:
                server_url = self.find_pingback_href(page.read())
            if server_url:
                server_url_splitted = urlsplit(server_url)
                if not server_url_splitted.netloc:
                    url_splitted = urlsplit(url)
                    server_url = '%s://%s%s' % (url_splitted.scheme,
                                                url_splitted.netloc,
                                                server_url)
                pingback_urls[url] = server_url
        except IOError:
            pass
    return pingback_urls

Developer: sergeny, Project: django-blog-zinnia, Lines: 27, Source: ping.py
Example 9: run

def run(self):
    while True:
        # Grab a (level, url) pair from the queue
        level, u = self.input_q.get()
        main = '{0.scheme}://{0.netloc}/'.format(urlsplit(u))
        # Fetch URLs
        if level < MAX_URL_LEVEL:
            html = _get_content(u)
            if not isinstance(html, list):
                soup = bs(html)
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if not href or len(href) < 2:
                        continue
                    # Check if the URL is relative
                    elif not urlsplit(href)[0] and not urlsplit(href)[1]:
                        self.output_q.put((level + 1, _url_discard(urljoin(u, href))))
                    elif href.startswith(main):
                        self.output_q.put((level + 1, _url_discard(href)))
                    else:
                        # Place for possible error logs (:
                        pass
        # Signal to the queue that the job is done
        self.input_q.task_done()

Developer: komarovf, Project: uwc2015, Lines: 30, Source: parser.py
Example 10: test_flow

def test_flow(self):
    url = self.sp.make_auth_req()
    status, headers, _ = self.getPage(url)
    assert status == '303 See Other'

    url = self.get_redirect_location(headers)
    req = parse_qs(urlsplit(url).query)
    assert 'SAMLRequest' in req
    assert 'RelayState' in req

    action, body = self.idp.handle_auth_req(req['SAMLRequest'][0],
                                            req['RelayState'][0],
                                            BINDING_HTTP_REDIRECT,
                                            'test1')
    status, headers, body = self.getPage(action, method='POST',
                                         body=urlencode(body))
    assert status == '302 Found'

    url = self.get_redirect_location(headers)
    req = parse_qs(urlsplit(url).query)
    assert 'SAMLResponse' in req
    assert 'RelayState' in req

    resp = self.sp.parse_authn_request_response(req['SAMLResponse'][0],
                                                BINDING_HTTP_REDIRECT)
    identity = resp.ava
    assert identity["displayName"][0] == "Test1"
    assert identity["sn"][0] == "[email protected]"
    assert identity['o'][0] == "Small university"

Developer: ibrsp, Project: s2sproxy, Lines: 28, Source: test_proxy_server.py
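The parse_qs(urlsplit(url).query) idiom in this test is a handy general pattern for reading parameters off a redirect URL; note that every value comes back as a list. A small sketch with a made-up redirect URL:

from urllib.parse import parse_qs, urlsplit

redirect = "https://sp.example.org/acs?SAMLResponse=PHNhbWw%2B&RelayState=test1"
req = parse_qs(urlsplit(redirect).query)
print(req)  # {'SAMLResponse': ['PHNhbWw+'], 'RelayState': ['test1']}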
Example 11: _url

def _url(self, hashed_name_func, name, force=False, hashed_files=None):
    """
    Return the non-hashed URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            args = (clean_name,)
            if hashed_files is not None:
                args += (hashed_files,)
            hashed_name = hashed_name_func(*args)
    final_url = super().url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)

Developer: Damgaard, Project: django, Lines: 30, Source: storage.py
Example 12: https_open

def https_open(self, request):
    """
    Send an HTTPS request, which can be either GET or POST,
    depending on request.has_data().

    Args:
        request - instance of urllib.request.Request
    """
    full_url = request.get_full_url()
    url_parts = parse.urlsplit(full_url)
    robo = None
    if url_parts.netloc in self.robots:
        robo = self.robots[url_parts.netloc]
    else:
        # Build the robots.txt URL for the requested host
        host = parse.urlsplit(full_url)[1]
        rurl = parse.urlunparse(("http", host, "/robots.txt", "", ""))
        robo = reppy.cache.RobotsCache()
        robo.fetch(rurl, self.agent_name)
        self.robots[url_parts.netloc] = robo
    # Is the URL allowed for this crawler by robots.txt?
    if robo.allowed(full_url, self.agent_name):
        # Delegate to the standard handler. Note: the parameter name
        # `request` shadows the urllib.request module, so the original
        # `request.HTTPHandler.https_open(...)` call would fail at runtime;
        # fully qualifying the parent handler is the likely intent.
        return urllib.request.HTTPSHandler.https_open(self, request)
    else:
        raise RuntimeError('Forbidden by robots.txt')

Developer: Armoken, Project: Learning, Lines: 27, Source: crawler.py
Example 13: get_fetcher

def get_fetcher(url=None, *, item=dict()):
    RTMP_PROTOCOLS = {'rtmp', 'rtmpt', 'rtmpe', 'rtmpte'}
    url = item.get("url", url)
    if urlsplit(url).scheme in RTMP_PROTOCOLS:
        return RtmpFetcher(url, live=True)
    auth = comm.get_auth()
    protocol = urlsplit(auth['server']).scheme
    if protocol in RTMP_PROTOCOLS:
        (url, ext) = url.rsplit('.', 1)  # strip the extension (.flv or .mp4)
        url = auth['playpath_prefix'] + url
        if ext == 'mp4':
            url = 'mp4:' + url
        rtmp_url = auth['rtmp_url']
        token = auth.get('token')
        if token:
            # Cannot use urljoin() because the RTMP scheme
            # would have to be added to its whitelist
            rtmp_url += '?auth=' + token
        return RtmpFetcher(rtmp_url, playpath=url)
    else:
        return HdsFetcher(url, auth)

Developer: timwhite, Project: python-iview, Lines: 26, Source: fetch.py
Example 14: zoom_article

def zoom_article(self, ticket_id, article_id):
    art_descr = self.__db.article_description(article_id)
    if art_descr[4] & ART_TEXT:
        return eval(self.__db.article_message(article_id))
    self.echo("Zoom article:", ticket_id, article_id)
    url_beg = urlsplit(self.runtime.get("site"))[:3]
    params = (
        ("Action", "AgentTicketZoom"), ("Subaction", "ArticleUpdate"),
        ("TicketID", ticket_id), ("ArticleID", article_id),
        ("OTRSAgentInterface", self.runtime["OTRSAgentInterface"]))
    url = urlunsplit(url_beg + (urlencode(params), ""))
    pg = TicketsPage(self.core)
    page = pg.load(url)
    if page is None:
        return
    mail_header = page.get("mail_header", [])
    if "mail_src" in page:
        url = urlunsplit(url_beg[:2] + urlsplit(page["mail_src"])[2:])
        self.echo("Get message:", url)
        pg = MessagePage(self.core)
        try:
            mail_text = pg.load(url)
        except LoginError:
            mail_text = pg.login()
    else:
        mail_text = page["message_text"]
    if mail_header:
        mail_text.insert(0, ("\n",))
        for i in reversed(mail_header):
            mail_text.insert(0, ("%s\t%s\n" % i,))
    shrink_tupled_text(mail_text)
    self.__db.article_message(article_id, repr(mail_text))
    return mail_text

Developer: Lysovenko, Project: OTRS_US, Lines: 33, Source: msg_ldr.py
Example 15: main

def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)

Developer: Kasfen, Project: networkprogramming, Lines: 7, Source: rscrape1.py
Example 16: find_pingback_urls

def find_pingback_urls(self, urls):
    """
    Find the pingback URL for each given URL.
    """
    pingback_urls = {}
    for url in urls:
        try:
            page = urlopen(url)
            headers = page.info()
            server_url = headers.get('X-Pingback')
            if not server_url:
                content_type = headers.get('Content-Type', '').split(
                    ';')[0].strip().lower()
                if content_type in ['text/html', 'application/xhtml+xml']:
                    server_url = self.find_pingback_href(
                        page.read(5 * 1024))
            if server_url:
                server_url_splitted = urlsplit(server_url)
                if not server_url_splitted.netloc:
                    url_splitted = urlsplit(url)
                    server_url = '%s://%s%s' % (url_splitted.scheme,
                                                url_splitted.netloc,
                                                server_url)
                pingback_urls[url] = server_url
        except IOError:
            pass
    return pingback_urls

Developer: Albertzzzz, Project: django-blog-zinnia, Lines: 31, Source: ping.py
Example 17: __init__

def __init__(self, registry, url="", auth=None, verify=False,
             api_timeout=None):
    # Registry ip:port
    self.registry = urlsplit(registry).netloc
    # Service url, ip:port
    self.url = url
    # Authentication (user, password) or None; used by requests to do
    # basic auth
    self.auth = auth
    # Timeout for HTTP requests
    self.api_timeout = api_timeout
    # Desired scope is the scope needed for the next operation on the
    # registry
    self.desired_scope = ""
    # Scope of the token we have
    self.scope = ""
    # Token used to authenticate
    self.token = ""
    # Boolean to enforce https checks; used by requests
    self.verify = verify
    # If we have no url then tokens are not required and get_new_token
    # will not be called
    if url:
        split = urlsplit(url)
        # Credentials embedded in the url take precedence over the given
        # username and password
        if split.username and split.password:
            self.auth = (split.username, split.password)
        self.token_required = True
    else:
        self.token_required = False

Developer: pombredanne, Project: docker-registry-client, Lines: 33, Source: AuthorizationService.py
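The credentials handling above leans on SplitResult's username, password, hostname, and port accessors, which are parsed out of the netloc. A short sketch with a hypothetical registry URL:

from urllib.parse import urlsplit

split = urlsplit("https://user:[email protected]:5000/v2/")
print(split.netloc)    # 'user:[email protected]:5000'
print(split.username)  # 'user'
print(split.password)  # 'secret'
print(split.hostname)  # 'registry.example.com'
print(split.port)      # 5000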
Example 18: hashed_name

def hashed_name(self, name, content=None, filename=None):
    # `filename` is the name of file to hash if `content` isn't given.
    # `name` is the base name to construct the new hashed filename from.
    parsed_name = urlsplit(unquote(name))
    clean_name = parsed_name.path.strip()
    filename = (filename and urlsplit(unquote(filename)).path.strip()) or clean_name
    opened = content is None
    if opened:
        if not self.exists(filename):
            raise ValueError("The file '%s' could not be found with %r." % (filename, self))
        try:
            content = self.open(filename)
        except IOError:
            # Handle directory paths and fragments
            return name
    try:
        file_hash = self.file_hash(clean_name, content)
    finally:
        if opened:
            content.close()
    path, filename = os.path.split(clean_name)
    root, ext = os.path.splitext(filename)
    if file_hash is not None:
        file_hash = ".%s" % file_hash
    hashed_name = os.path.join(path, "%s%s%s" %
                               (root, file_hash, ext))
    unparsed_name = list(parsed_name)
    unparsed_name[2] = hashed_name
    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    if '?#' in name and not unparsed_name[3]:
        unparsed_name[2] += '?'
    return urlunsplit(unparsed_name)

Developer: ArcTanSusan, Project: django, Lines: 33, Source: storage.py
Example 19: sitelinks

def sitelinks(self, html_page, url):
    """Finds all links in the provided html page"""
    bs = BeautifulSoup(html_page)
    links = set()
    urlpart = urlsplit(url)
    try:
        for anchor in bs.find_all('a'):
            linkpart = list(urlsplit(anchor['href']))
            linkpart[4] = ''  # remove the fragment
            if linkpart[0] == '':
                linkpart[0] = urlpart.scheme
            if linkpart[1] == '':
                linkpart[1] = urlpart.netloc
            if linkpart[0] == urlpart.scheme and linkpart[1] == urlpart.netloc:
                if linkpart[2].startswith('/'):
                    links.add(urlunsplit(linkpart))
                elif linkpart[2] != '':
                    # Relative URL
                    links.add(urljoin(url, linkpart[2]))
    except KeyError:
        pass
    return links

Developer: nada-labs, Project: sitemap-generator, Lines: 27, Source: spider.py
Example 20: __init__

def __init__(
        self,
        url=DEFAULT_URI,
        name=None,
        ssl_required=False,
        verbose=False,
        pedantic=False,
        socket_keepalive=False):
    self._connect_timeout = None
    self._socket_keepalive = socket_keepalive
    self._socket = None
    self._socket_file = None
    self._subscriptions = {}
    self._next_sid = 1
    self._server = None
    self._server_index = 0

    if isinstance(url, (list, tuple)):
        urls = [urlparse.urlsplit(x) for x in url]
    else:
        urls = [urlparse.urlsplit(url)]

    self._options = {
        'url': urls,
        'name': name,
        'ssl_required': ssl_required,
        'verbose': verbose,
        'pedantic': pedantic
    }

Developer: bahadir, Project: pynats, Lines: 30, Source: connection.py
Note: The urllib.parse.urlsplit examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright belongs to the original authors, and any use or redistribution should follow the corresponding project's license. Do not reproduce without permission.