本文整理汇总了Python中urlparse.urlunsplit函数的典型用法代码示例。如果您正苦于以下问题:Python urlunsplit函数的具体用法?Python urlunsplit怎么用?Python urlunsplit使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了urlunsplit函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: video
def video():
    '''
    Video request handler.

    Walks the configured video directory tree and builds one entry per
    file found, carrying the date (containing directory name), a download
    URL and a thumbnail URL derived from the request's base URL.

    :return: list of available videos in json format.
    '''
    entries = []
    for dirpath, _dirnames, filenames in os.walk(settings.VIDEO_FILES_PATH):
        if not filenames:  # directory contains no files
            continue
        date = os.path.basename(dirpath)
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            rel_path = os.path.relpath(full_path,
                                       start=settings.VIDEO_FILES_PATH)
            # Reuse scheme and netloc of the incoming request for links.
            scheme, netloc = urlparse.urlsplit(request.base_url)[:2]
            url = urlparse.urlunsplit(
                [scheme, netloc,
                 settings.VIDEO_FILES_LOCATION + '/' + rel_path, '', ''])
            thumb_path = (settings.THUMBNAIL_FILES_LOCATION + '/' +
                          os.path.splitext(rel_path)[0] + '.png')
            thumbnail = urlparse.urlunsplit(
                [scheme, netloc, thumb_path, '', ''])
            entries.append({'date': date, 'url': url, 'thumbnail': thumbnail})
    # Newest first.
    entries.sort(reverse=True, key=lambda item: item['date'])
    response = Response()
    response.headers['Content-Type'] = 'application/json'
    response.data = json.dumps(entries)
    return response
开发者ID:weisert,项目名称:mpsb,代码行数:28,代码来源:api.py
示例2: handler
def handler(self, fname, language='text', linenumbers=False, filename=None, site=None, data=None, lang=None, post=None):
    """Create HTML for a listing.

    Resolves *fname* against the configured listings folder(s), reads the
    file, and returns a header link plus pygments-highlighted source,
    along with the list of file dependencies.
    """
    fname = fname.replace('/', os.sep)
    if len(self.folders) != 1:
        # must be new syntax: specify folder name
        fpath = os.path.join(fname)
    else:
        listings_folder = next(iter(self.folders.keys()))
        if fname.startswith(listings_folder):
            # new syntax: specify folder name
            fpath = os.path.join(fname)
        else:
            # old syntax: don't specify folder name
            fpath = os.path.join(listings_folder, fname)
    # Pygments expects 'table' (or False) for the linenos option.
    linenumbers = 'table' if linenumbers else False
    deps = [fpath]
    with open(fpath, 'r') as inf:
        data = inf.read()
    slashed = fpath.replace('\\', '/')
    target = urlunsplit(("link", 'listing', slashed, '', ''))
    src_target = urlunsplit(("link", 'listing_source', slashed, '', ''))
    src_label = self.site.MESSAGES('Source')
    lexer = pygments.lexers.get_lexer_by_name(language)
    formatter = pygments.formatters.get_formatter_by_name(
        'html', linenos=linenumbers)
    header = '<a href="{1}">{0}</a> <a href="{3}">({2})</a>'.format(
        fname, target, src_label, src_target)
    return header + pygments.highlight(data, lexer, formatter), deps
开发者ID:getnikola,项目名称:nikola,代码行数:30,代码来源:listing.py
示例3: to_python
def to_python(self, value):
    """Normalize *value* into a well-formed URL string.

    Adds a default "http" scheme, promotes a bare path to the domain
    position, and guarantees a "/" path before any query string.
    """
    def split_url(url):
        """
        Return the parts of *url* as a list via ``urlparse.urlsplit``, or
        raise a ``ValidationError`` when the URL cannot be parsed.
        """
        try:
            return list(urlparse.urlsplit(url))
        except ValueError:
            # urlparse.urlsplit can raise a ValueError with some
            # misformatted URLs.
            raise ValidationError(self.error_messages["invalid"])

    value = super(URLField, self).to_python(value)
    if not value:
        return value
    fields = split_url(value)
    if not fields[0]:
        # If no URL scheme given, assume http://
        fields[0] = "http"
    if not fields[1]:
        # No domain given: the path segment presumably contains the
        # domain (and possibly the path), so shift it over and re-split.
        fields[1], fields[2] = fields[2], ""
        fields = split_url(urlparse.urlunsplit(fields))
    if not fields[2]:
        # The path portion may need to be added before query params.
        fields[2] = "/"
    return urlparse.urlunsplit(fields)
开发者ID:tovenja,项目名称:django,代码行数:32,代码来源:fields.py
示例4: get_relative_url
def get_relative_url(destination, source):
    """Get relative URL between two sources.

    http://stackoverflow.com/a/7469668/315168

    :param destination: absolute target URL
    :param source: absolute URL the result should be relative to
    :return: tuple (is same domain, relative url)
    """
    dest_parts = urlparse.urlsplit(destination)
    src_parts = urlparse.urlsplit(source)
    dest_root = urlparse.urlunsplit(dest_parts[:2] + ('', '', ''))
    src_root = urlparse.urlunsplit(src_parts[:2] + ('', '', ''))
    if dest_root != src_root:
        ## This is a different domain
        return False, destination
    # If there is no / component in url assume it's root path
    base_path = src_parts.path if src_parts.path else "/"
    relative = posixpath.relpath(dest_parts.path, posixpath.dirname(base_path))
    return True, relative
开发者ID:kiok46,项目名称:webkivy,代码行数:27,代码来源:relurl.py
示例5: _generalize_url
def _generalize_url(self, url):
    """Generalize *url* by substituting path segments that act as IDs.

    A segment is treated as an ID when its parent URL is a collection
    already known to this container; scheme, netloc, query and fragment
    are preserved.
    """
    pieces = urlsplit(url)
    root = urlunsplit((pieces.scheme, pieces.netloc, '', '', ''))
    generalized = root
    concrete = root
    inside_collection = False
    for piece in split_path_into_segments(pieces.path):
        if inside_collection:
            generalized += '/' + ID_SUBSTITUTE_CHAR
        else:
            generalized += '/' + piece
        concrete += '/' + piece
        # Track whether the URL built so far is a known collection, so
        # the NEXT segment can be recognized as an ID inside it.
        inside_collection = concrete in self and self._is_a_collection(concrete)
    return urlunsplit((pieces.scheme,
                       pieces.netloc,
                       urlsplit(generalized).path,
                       pieces.query,
                       pieces.fragment))
开发者ID:01org,项目名称:intelRSD,代码行数:26,代码来源:discovery_container.py
示例6: parse
def parse(self, response):
    """Parse a dealer inventory page.

    Yields one ``Car`` item per used-car listing found on the page, then
    follows the rel="next" pagination link (if present) with the same
    callback.
    """
    sel = Selector(response)
    # Extract any cars found
    cars = sel.xpath('//*[contains(@class, "inv-type-used")]')
    for c in cars:
        car = Car()
        # Title and year
        car['title'] = c.xpath('.//div/div/h1/a/text()').extract()[0].strip()
        # Assumes the title starts with a 4-digit year - TODO confirm
        car['year'] = car['title'][0:4]
        # Price, but remove non-number characters.
        # Examples: '$12,000', 'Please Call', etc.
        price = c.xpath('.//*[contains(@class, "value")]/text()').extract()[0]
        car['price'] = ''.join(d for d in price if d.isdigit())
        # url
        # Rebuild an absolute URL from the page's scheme/host plus the
        # listing's (site-absolute) href path.
        path = c.xpath('.//div/div/h1/a/@href').extract()[0]
        url = urlparse.urlparse(response.url)
        car['url'] = urlparse.urlunsplit([url.scheme, url.netloc, path, None, None])
        # Certain specs are frequently missing, so we need to handle
        # them with try / except
        specs = [
            {
                'name': 'vin',
                'xpath': './/*/dt[text()="VIN:"]/following-sibling::dd/text()'
            },
            {
                'name': 'color',
                'xpath': './/*/dt[text()="Exterior Color:"]/following-sibling::dd/text()'
            },
            {
                'name': 'miles',
                'xpath': './/*/dt[text()="Mileage:"]/following-sibling::dd/text()'
            },
            {
                'name': 'transmission',
                'xpath': './/*/dt[text()="Transmission:"]/following-sibling::dd/text()'
            }
        ]
        for s in specs:
            try:
                car[s['name']] = c.xpath(s['xpath']).extract()[0]
            except IndexError:
                # Spec absent from this listing; record it as unknown.
                car[s['name']] = None
        yield car
    # If there's a next page link, parse it for cars as well
    next_links = sel.xpath('//*[@rel="next"]/@href').extract()
    if len(next_links) > 0:
        query = next_links[0]
        url = urlparse.urlparse(response.url)
        base = urlparse.urlunsplit([url.scheme, url.netloc, url.path, None, None])
        next_url = urlparse.urljoin(base, query)
        # Construct url
        yield Request(next_url, callback=self.parse)
开发者ID:JeffPaine,项目名称:subaru_search,代码行数:60,代码来源:subaru_spider.py
示例7: _load_uri
def _load_uri(self, base_uri, uri_to_resolve):
    """
    Obtain a remote instruction.

    Resolves *uri_to_resolve* against *base_uri* and loads it over
    HTTP(S) or from the local filesystem (empty scheme).  Crossing from
    one scheme to another is refused.

    Returns the instruction as a python object, along with the resolved uri.

    :raises SchemeSecurityError: when the resolved scheme differs from
        the base URI's scheme.
    :raises InvalidInstructionError: for unsupported schemes, fetch/open
        failures, or invalid JSON.
    """
    resolved_uri = urlparse.urlsplit(urlparse.urljoin(base_uri, uri_to_resolve))
    base_scheme = urlparse.urlsplit(base_uri).scheme
    if base_scheme is not None and base_scheme != resolved_uri.scheme:
        raise SchemeSecurityError("Cannot cross from '%s' to '%s'" % (
            base_scheme, resolved_uri.scheme))
    try:
        if resolved_uri.scheme in ['http', 'https']:
            # BUG FIX: requests.get() needs a URL string; previously the
            # SplitResult itself was passed, which requests cannot use.
            instruction = json.loads(
                requests.get(urlparse.urlunsplit(resolved_uri)).text)
        elif resolved_uri.scheme == '':
            # BUG FIX: was `scheme is ''` -- identity comparison with a
            # string literal relies on interning and is not guaranteed.
            instruction = json.load(open(urlparse.urlunsplit(resolved_uri)))
        else:
            raise InvalidInstructionError("Reference to unsupported scheme '%s'" % (
                resolved_uri.scheme))
        return instruction, urlparse.urlunsplit(resolved_uri)
    except requests.exceptions.RequestException as e:
        raise InvalidInstructionError("Couldn't load '%s': %s" % (resolved_uri, e))
    except IOError as e:
        raise InvalidInstructionError("Couldn't open '%s': %s" % (resolved_uri, e))
    except ValueError:
        raise InvalidInstructionError("Invalid JSON in '%s'" % resolved_uri)
开发者ID:pombredanne,项目名称:pycaustic,代码行数:27,代码来源:scraper.py
示例8: from_url
def from_url(url, headers=None, allowed=None):
    """Create a publisher appropriate for *url*'s scheme.

    Supported schemes: ``qpid`` (QpidPublisher), ``rabbit``
    (RabbitPublisher, URL rewritten to ``amqp``), ``log`` (LogPublisher)
    and ``count`` (CountPublisher).  Any other scheme yields None.

    :raises ApplicationException: when a qpid/rabbit URL lacks a
        ``queue`` query parameter.
    """
    if headers is None:
        headers = {}
    result = urlparse.urlsplit(url)
    if result.scheme == 'qpid':
        # remove the queue from the url
        queue, query = extract_param('queue', result.query)
        if queue is None:
            raise ApplicationException('No queue provided in qpid url!')
        new_url = urlparse.urlunsplit((result.scheme, result.netloc, result.path,
                                       query, result.fragment))
        return QpidPublisher(new_url, queue, headers, allowed)
    elif result.scheme == 'rabbit':
        queue, query = extract_param('queue', result.query)
        if queue is None:
            # BUG FIX: error message previously said "qpid url" even
            # though this is the rabbit branch.
            raise ApplicationException('No queue provided in rabbit url!')
        # RabbitMQ speaks AMQP, so rewrite the scheme accordingly.
        new_url = urlparse.urlunsplit(('amqp', result.netloc, result.path,
                                       query, result.fragment))
        return RabbitPublisher(new_url, queue, headers, allowed)
    elif result.scheme == 'log':
        return LogPublisher(allowed)
    elif result.scheme == 'count':
        return CountPublisher(allowed)
开发者ID:cwingard,项目名称:mi-instrument,代码行数:31,代码来源:publisher.py
示例9: requestData
def requestData(self, basepath):
    """Extract migration data from a Nexus server.

    Deploys a Groovy extraction script via the Nexus scripting REST API,
    runs it, and always attempts to delete it afterwards (even when the
    run fails).

    :param basepath: REST base path appended to the path part of self.url.
    :return: the parsed extraction result dict, or an error string.
    """
    self.log.info("Attempting to communicate with Nexus server.")
    # HTTP basic auth header from configured credentials (Python 2:
    # b64encode accepts str).
    auth = "Basic " + base64.b64encode(self.user + ':' + self.pasw)
    # self.url is assumed to be a urlsplit-style tuple
    # (scheme, netloc, path, ...) - TODO confirm against caller.
    deppath = self.url[2] + basepath
    delpath = deppath + '/artifactorymigrator'
    runpath = delpath + '/run'
    depurl = urlparse.urlunsplit((self.url[0], self.url[1], deppath, '', ''))
    delurl = urlparse.urlunsplit((self.url[0], self.url[1], delpath, '', ''))
    runurl = urlparse.urlunsplit((self.url[0], self.url[1], runpath, '', ''))
    delheaders = {'User-Agent': 'nex2art', 'Authorization': auth}
    depheaders, runheaders = delheaders.copy(), delheaders.copy()
    depheaders['Content-Type'] = 'application/json'
    runheaders['Content-Type'] = 'text/plain'
    # The extraction script ships as a package resource.
    depjson = {'name': 'artifactorymigrator', 'type': 'groovy'}
    depjson['content'] = pkgutil.get_data('nex2art', 'resources/plugin.groovy')
    depbody = json.dumps(depjson)
    res, data = None, None
    self.log.info("Deploying extraction plugin to Nexus.")
    ex, _ = self.dorequest(depurl, depbody, depheaders, 'POST', "deploy")
    if ex == None:
        try:
            self.log.info("Executing Nexus extraction.")
            ex, res = self.dorequest(runurl, None, runheaders, 'POST', "execute", True)
        finally:
            # Clean up the plugin even when execution raised/failed.
            self.log.info("Deleting extraction plugin from Nexus.")
            self.dorequest(delurl, None, delheaders, 'DELETE', "delete")
    if res != None and 'result' in res: data = json.loads(res['result'])
    if ex != None:
        self.log.error("Error accessing Nexus instance: %s", ex)
        return "Error accessing Nexus instance."
    self.log.info("Successfully fetched Nexus data.")
    return data
开发者ID:JFrogDev,项目名称:nexus2artifactory,代码行数:32,代码来源:Nexus3.py
示例10: resolve_links
def resolve_links(self, links, pageurl):
    """Yield crawlable URLs for *links* found on *pageurl*.

    Remote hosts, pure-fragment links and non-http schemes (e.g. mailto)
    are skipped; same-host http links are rewritten host-relative, and
    relative links are joined against *pageurl*.
    """
    for x in links:
        p = urlparse.urlsplit(x)
        if p.scheme == "http":
            if p.netloc != self.hostname:
                # Remote link
                continue
            # Turn this into a host-relative url
            p = ('', '', p.path, p.query, '')
        if p[4] != "" or p[3] != "":
            # Remove fragments (part of the url past #)
            # NOTE(review): this also clears the query string (p[3]),
            # not only the fragment (p[4]) - confirm that is intended.
            p = (p[0], p[1], p[2], '', '')
        if p[0] == "":
            if p[2] == "":
                # Nothing in the path, so it's a pure fragment url
                continue
            if p[2][0] == "/":
                # Absolute link on this host, so just return it
                yield urlparse.urlunsplit(p)
            else:
                # Relative link
                yield urlparse.urljoin(pageurl, urlparse.urlunsplit(p))
        else:
            # Ignore unknown url schemes like mailto
            pass
开发者ID:a1exsh,项目名称:pgweb,代码行数:28,代码来源:basecrawler.py
示例11: _split_uri
def _split_uri(self, identifier):
    """Split an RDF *identifier* into a (namespace, resource_id) pair.

    Fragment URIs split at '#'; otherwise slash-containing paths split
    at the last segment (last two for trailing-slash URIs); bare paths
    and everything else fall back to rdflib's ``split_uri``.

    :raises ValueError: if *identifier* is not a ``URIRef``.
    """
    if isinstance(identifier, URIRef):
        scheme, netloc, path, query, fragment = urlsplit(identifier)
        if query:
            # NOTE(review): this result is overwritten by the branches
            # below whenever a fragment or path is present - confirm the
            # precedence is intended.
            namespace, resource_id = split_uri(identifier)
        if fragment:
            # if we have a fragment, we will split there
            namespace, resource_id = urldefrag(identifier)
            namespace += "#"
        elif "/" in path and len(path) > 1:
            splits = path.split("/")
            if path.endswith("/"):
                # Trailing slash: keep the last two segments as the id.
                resource_id = "/".join(splits[-2:])
                path = "/".join(splits[:-2]) + "/"
                namespace = urlunsplit((scheme, netloc, path, "", ""))
            else:
                # Last segment is the id; the rest is the namespace.
                resource_id = "/".join(splits[-1:])
                path = "/".join(splits[:-1]) + "/"
                namespace = urlunsplit((scheme, netloc, path, "", ""))
        elif path:
            # Path with no '/' separators: the whole path is the id.
            resource_id = path
            namespace = urlunsplit((scheme, netloc, "", "", ""))
        else:
            namespace, resource_id = split_uri(identifier)
        log.debug("Split %s to %s, %s" % (identifier, namespace, resource_id))
        return namespace, resource_id
    else:
        raise ValueError("Unknown identifier type %r" % identifier)
开发者ID:handloomweaver,项目名称:agamemnon,代码行数:29,代码来源:rdf_store.py
示例12: rewrite_urls
def rewrite_urls(origin_url, urls):
    """Yield normalized versions of *urls* relative to *origin_url*.

    Strips line breaks/tabs, rewrites the scheme, re-attaches the origin
    URL's credentials for same-host links, resolves on-site relative
    paths against *origin_url*, and percent-encodes spaces.
    """
    origin_parts = urlparse.urlsplit(origin_url)
    for raw in urls:
        if not raw:
            continue
        # kill breaks
        cleaned = re.sub("(\n|\t)", "", raw)
        parts = urlparse.urlsplit(cleaned)
        scheme, netloc, path, query, fragment = parts
        # try to rewrite scheme
        scheme = rewrite_scheme(parts.scheme)
        # rewrite netloc to include credentials
        if origin_parts.username and parts.hostname == origin_parts.hostname:
            netloc = assemble_netloc(origin_parts.username,
                                     origin_parts.password,
                                     parts.hostname, parts.port)
        if not scheme and not netloc and (path or query):
            # no scheme or netloc, it's a path on-site
            path_query = urlparse.urlunsplit((None, None, path, query, None))
            rewritten = urlparse.urljoin(origin_url, path_query)
        else:
            # reassemble into url
            rewritten = urlparse.urlunsplit((scheme, netloc, path, query, None))
        # quote spaces
        rewritten = rewritten.replace(" ", "%20")
        if rewritten:
            yield rewritten
开发者ID:numerodix,项目名称:qontexte,代码行数:30,代码来源:urlrewrite.py
示例13: verify_image
def verify_image(self, baseURL, imageURL):
    """Assert that *imageURL* answers a HEAD request with HTTP 200.

    Relative image URLs are resolved against *baseURL* first; invalid
    URLs or request failures fail the test.
    """
    fullImageURL = imageURL
    if not urlsplit(imageURL).scheme:
        # Resolve relative path
        fullImageURL = urljoin(baseURL, imageURL)
    echo("Checking image: {}".format(fullImageURL))
    urlparts = urlsplit(fullImageURL)
    # Escaped (scheme, netloc, path, query, fragment) with host parts
    # left untouched - see get_escaped_address_parts_minus_host.
    escapedparts = self.get_escaped_address_parts_minus_host(urlparts)
    if urlparts.netloc and urlparts.path:
        try:
            # HEAD avoids downloading the image body.
            conn = httplib.HTTPConnection(urlparts.netloc)
            conn.request("HEAD", urlunsplit(escapedparts))
            echo("Going to path: {}\n".format(urlunsplit(escapedparts)))
            res = conn.getresponse()
        except Exception as inst:
            self.fail("While checking image {}, encountered exception: {}".format(
                fullImageURL, inst))
        self.assertEqual(res.status, 200,
            'The image at {} is not OK. Looking for it resulted in HTTP code: {}'.format(
                urlunsplit([urlparts.scheme, urlparts.netloc, escapedparts[2],
                            escapedparts[3], escapedparts[4]]),
                res.status))
    else:
        self.fail("The URL for this image is invalid: {}".format(fullImageURL))
开发者ID:Digidai,项目名称:Harvard-Mobile-Web,代码行数:27,代码来源:check_modules.py
示例14: get_onedrive_embed_code
def get_onedrive_embed_code(self, onedrive_url):
    """Turn a OneDrive document URL into iframe embed code.

    A URL that already is an embed code (starts with ``<iframe``) is
    returned unchanged; OneDrive for Business (sharepoint.com) and
    consumer OneDrive (live.com) URLs are rewritten into embeddable
    document URLs; anything else yields None.
    """
    onedrive_url = onedrive_url.strip()
    # Already an embed code? Hand it back as-is.
    if re.match('<iframe', onedrive_url, re.IGNORECASE) is not None:
        return onedrive_url
    scheme, netloc, path, query_string, fragment = urlsplit(onedrive_url)
    params = parse_qs(query_string)
    # OneDrive for Business
    if re.match('https?:\/\/((\w|-)+)-my.sharepoint.com\/',
                onedrive_url, re.IGNORECASE) is not None:
        params['action'] = ['embedview']
        embed_url = urlunsplit((scheme, netloc, path,
                                urlencode(params, doseq=True), fragment))
        return self.EMBED_CODE_TEMPLATE.format(embed_url)
    # OneDrive (for consumers)
    if re.match('(https?:\/\/(onedrive\.)?)(live\.com)',
                onedrive_url, re.IGNORECASE) is not None:
        embed_path = path.replace('view.aspx', 'embed').replace('redir', 'embed')
        params = parse_qs(query_string)
        params['em'] = ['2']
        embed_url = urlunsplit((scheme, netloc, embed_path,
                                urlencode(params, doseq=True), fragment))
        return self.EMBED_CODE_TEMPLATE.format(embed_url)
开发者ID:introp-software,项目名称:xblock-onedrive,代码行数:35,代码来源:onedrive.py
示例15: normalize_url
def normalize_url(url, domain_canonical=None):
    """
    Ensure we have a value url - raise exception if not.
    If given, we convert the domain to a domain_canonical

    :param url: URL string; "http://" is prepended when no scheme given.
    :param domain_canonical: optional replacement domain.
    :return: the normalized URL string (lowercased domain, "/" path for
        naked domains).
    :raises reqfilter.Error: for non-http(s) schemes or invalid domains.
    """
    # NOTE(review): `split` is assumed to be a module-level holder of
    # urlsplit indices (scheme/domain/path) - confirm its definition.
    url = url.strip()
    rgURL = list(urlparse.urlsplit(url))
    if rgURL[split.scheme] == '':
        # No scheme given: assume http and re-split.
        url = r"http://%s" % url
        rgURL = list(urlparse.urlsplit(url))
    # Invalid protocol
    if rgURL[split.scheme] != "http" and rgURL[split.scheme] != "https":
        raise reqfilter.Error("Invalid protocol: %s" % rgURL[split.scheme])
    if domain_canonical is not None:
        rgURL[split.domain] = domain_canonical
    if rgURL[split.domain]:
        rgURL[split.domain] = rgURL[split.domain].lower()
    # Reject missing, malformed or over-long (>255 chars) domains.
    if not rgURL[split.domain] or not regDomain.search(rgURL[split.domain]) or len(rgURL[split.domain]) > 255:
        raise reqfilter.Error("Invalid URL: %s" % urlparse.urlunsplit(rgURL))
    # Always end naked domains with a trailing slash as canonical
    if rgURL[split.path] == '':
        rgURL[split.path] = '/'
    return urlparse.urlunsplit(rgURL)
开发者ID:BenFort,项目名称:startpad,代码行数:30,代码来源:util.py
示例16: login
def login(self):
    """
    Set a cookie and redirect to the url that we tried to
    authenticate against originally.

    FIXME - I don't think we need this any more now that the EULA is gone -EAD
    """
    request = self.REQUEST
    response = request['RESPONSE']
    login = request.get('__ac_name', '')
    password = request.get('__ac_password', '')
    submitted = request.get('submitted', '')
    pas_instance = self._getPAS()
    if pas_instance is not None:
        try:
            pas_instance.updateCredentials(request, response, login, password)
        except (KeyError, POSKeyError):
            # see defect ZEN-2942 If the time changes while the server is running
            # set the session database to a sane state.
            ts = self.unrestrictedTraverse('/temp_folder/session_data')
            ts._reset()
            _originalResetCredentials(self, request, response)
    came_from = request.form.get('came_from') or ''
    if came_from:
        # Rewrite came_from's query string: drop 'terms' and carry the
        # 'submitted' flag forward if not already present.
        parts = urlparse.urlsplit(came_from)
        querydict = parse_qs(parts[3])
        querydict.pop('terms', None)
        if 'submitted' not in querydict.keys():
            querydict['submitted'] = submitted
        newqs = urllib.urlencode(querydict, doseq=True)
        parts = parts[:3] + (newqs,) + parts[4:]
        came_from = urlparse.urlunsplit(parts)
    else:
        # No origin URL: fall back to the dashboard.
        submittedQs = 'submitted=%s' % submitted
        came_from = '/zport/dmd?%s' % submittedQs
    if not self.dmd.acceptedTerms:
        # Terms not yet accepted: detour through the terms page first.
        url = "%s/zenoss_terms/?came_from=%s" % (
            self.absolute_url(), urllib.quote(came_from))
    else:
        # get rid of host part of URL (prevents open redirect attacks)
        clean_url = ['', ''] + list(urlparse.urlsplit(came_from))[2:]
        url = urlparse.urlunsplit(clean_url)
    fragment = request.get('fragment', '')
    if fragment:
        # Re-attach a client-side fragment, ensuring a leading '#'.
        fragment = urllib.unquote( fragment)
        if not fragment.startswith( '#'):
            fragment = '#' + fragment
        url += fragment
    if self.dmd.uuid is None:
        self.dmd.uuid = str(uuid1())
    return response.redirect(url)
开发者ID:damilare,项目名称:zenoss-prodbin,代码行数:60,代码来源:pasmonkey.py
示例17: get
def get(self):
    """Render an RSS feed of the nine most recent articles as XML."""
    articles = models.Article.all().order('-pubdate').fetch(9)
    items = []
    mostRecentDate = None
    # Scheme and host of the current request, used to build absolute links.
    url_parts = list(urlparse.urlsplit(self.request.url)[0:2])
    for article in articles:
        if not mostRecentDate:
            # Articles are ordered newest-first, so the first pubdate
            # seen is the feed's last-build date.
            mostRecentDate = article.pubdate
        # Resolve the stored image keys into entities.
        article.rimages = [db.get(image) for image in article.images]
        url = urlparse.urlunsplit(url_parts + ['/page/%s' % article.slug, '', ''])
        items.append(
            PyRSS2Gen.RSSItem(
                title = article.title,
                link = url,
                description = article.text,
                pubDate = article.pubdate))
    rss = PyRSS2Gen.RSS2(
        title = "RSS feed",
        link = urlparse.urlunsplit(url_parts + ['', '', '']),
        description = "My RSS feed",
        lastBuildDate = mostRecentDate,
        items = items,
    )
    self.response.headers['Content-Type'] = 'text/xml'
    self.response.out.write(rss.to_xml())
开发者ID:ghber,项目名称:cygnuscms,代码行数:25,代码来源:main.py
示例18: key
def key(self, obj):
    """Return a cache key (relative path to file in cache) for an object

    Accepts numpy arrays (content hash), URLs (filename plus URL hash),
    local files (cache-relative path or hashed filename), existing cache
    keys, or arbitrary strings.

    :raises CacheError: for unsupported object types.
    """
    if isnumpy(obj):
        # Key is byte view sha1 hash with .h5 extension
        byteview = obj.view(numpy.uint8)
        key = str(hashlib.sha1(byteview).hexdigest()) + '.h5'
    elif isurl(obj):
        # key is URL filename with an appended hash (for uniqueness)
        p = urlparse.urlsplit(obj)
        # NOTE(review): urlquery and filename are computed but unused.
        urlquery = urlparse.urlunsplit([p[0],p[1],p[2],p[3],None])
        urlpath = urlparse.urlunsplit([p[0],p[1],p[2],None,None])
        urlhash = self._hash(obj)
        (filename, ext) = splitextension(path.basename(urlpath))
        key = str(urlhash) + str(ext)
    elif os.path.isfile(obj):
        # within cache?
        filebase = obj.split(self.root(),1)
        if len(filebase) == 2:
            # key is subpath within cache
            key = filebase[1][1:]
        else:
            # key is filename with unique appended hash
            (head, tail) = os.path.split(obj)
            (filename, ext) = splitextension(tail)
            namehash = hashlib.sha1(tail).hexdigest()
            key = filename + '_' + str(namehash[0:7]) + ext
    elif (path.isfile(self.abspath(obj)) or path.isdir(self.abspath(obj))):
        key = obj # Already a cache key
    elif isstring(obj):
        key = obj # Use arbitrary string if not file or url
    else:
        raise CacheError('[bobo.cache][ERROR]: Unsupported object for constructing key')
    return key
开发者ID:jethrotan,项目名称:bobo,代码行数:33,代码来源:cache.py
示例19: __init__
def __init__(self, baseUri, headers=None, maxClients=None,
             maxConnections=None):
    """Initialize an HTTP(S) client for *baseUri*.

    Splits any user:password credentials out of the URI's netloc, keeps
    the query/fragment separately, and sets up a request dispatcher.

    :raises ValueError: when the URI scheme is not http or https.
    """
    self._headers = headers or HTTPHeaders()
    self._user = None
    self._passwd = None
    baseUri = baseUri.rstrip('/')
    self._scheme, loc, self._path, query, frag = urlparse.urlsplit(baseUri)
    # Peel "user:password@" off the network location, if present.
    userpass, self._hostport = urllib.splituser(loc)
    if userpass:
        self._user, self._passwd = urllib.splitpasswd(userpass)
    # Base URI without credentials, query or fragment.
    self._baseUri = urlparse.urlunsplit((self._scheme, self._hostport,
                                         self._path, None, None))
    if self._scheme not in ('http', 'https'):
        raise ValueError(self._scheme)
    self._dispatcher = RequestDispatcher(maxClients=maxClients,
                                         maxConnections=maxConnections)
    # Query and fragment retained for re-attachment to request URLs.
    self._queryFragment = urlparse.urlunsplit(('', '', '', query, frag))
开发者ID:pombreda,项目名称:robj,代码行数:25,代码来源:client.py
示例20: _get_robotparser
def _get_robotparser(self, link):
    """Return the proper robots parser for the given url or None if one
    cannot be constructed. Robot parsers are cached per scheme and
    netloc."""
    # only some schemes have a meaningful robots.txt file
    if link.scheme != 'http' and link.scheme != 'https':
        debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
        return None
    # split out the key part of the url
    location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
    # try to create a new robotparser if we don't already have one
    # MODERNIZED: dict.has_key() is deprecated (removed in Python 3);
    # the `in` operator is the equivalent, version-safe idiom.
    if location not in self._robotparsers:
        import httplib
        debugio.info(' getting robots.txt for %s' % location)
        # Cache None first so a failed fetch is not retried every call.
        self._robotparsers[location] = None
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urlunsplit(
                (link.scheme, link.netloc, '/robots.txt', '', '')))
            rp.read()
            self._robotparsers[location] = rp
        except (TypeError, IOError, httplib.HTTPException):
            # ignore any problems setting up robot parser
            pass
    return self._robotparsers[location]
开发者ID:BackupTheBerlios,项目名称:rheinaufcms-svn,代码行数:25,代码来源:crawler.py
注:本文中的urlparse.urlunsplit函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论