This article collects typical usage examples of the urlnorm.norm function in Python. If you are wondering how to call norm, what it does, or what it looks like in real code, the curated examples here may help.
The following presents 20 code examples of the norm function, sorted by popularity by default.
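Before diving into the project snippets, here is a minimal, self-contained sketch of the typical call pattern. It is not taken from any of the projects below; it only assumes the urlnorm package is installed, which provides norm() and the InvalidUrl exception used throughout these examples.

import urlnorm

def safe_norm(url):
    # Return the normalized URL, or None if urlnorm rejects it as invalid.
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        return None

# Scheme/host case, default ports and "../" segments are normalized, so both
# spellings below should map to the same canonical form.
print(safe_norm("HTTP://www.Example.com:80/a/../b"))
print(safe_norm("http://www.example.com/b"))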
Example 1: normalize_url
def normalize_url(base_url, url):
    myfile3 = open('normalization_log', 'a')
    myfile3.write("base url:{0}\n".format(base_url))
    myfile3.write("url:{0}\n".format(url))
    myfile3.close()
    result = ''
    # if url starts with http:// or https://
    allowed_scheme = ['http', 'https']
    url_scheme = urlparse(url).scheme
    if url_scheme in allowed_scheme:
        return urlnorm.norm(url)
    elif url_scheme == 'mailto':
        return False
    elif len(url_scheme) == 0:
        # check if URL starts with ../ or ./
        if (url[:3] == '../') or (url[:2] == './'):
            return urlnorm.norm(base_url + '/' + url)
        elif url[0] == '/':  # e.g. /page/page
            # That means it's the domain + url
            url_obj = urlparse(base_url)
            new_url = url_obj.scheme + "://" + url_obj.netloc + url
            return urlnorm.norm(new_url)
        else:  # URL should be just an html page, e.g. research.html,
            # so we need to replace the last part of the base URL.
            # If base_url is 'http://www.test.com/page/page/12345',
            # parts will be ['http://www.test.com/page/page', '12345'].
            parts = base_url.rsplit('/', 1)
            return urlnorm.norm(parts[0] + '/' + url)
    result = url
    return result
Author: sigmundc, Project: CS6965, Lines: 32, Source file: crawler3.py
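For illustration, here are a few hypothetical calls to the helper above, assuming urlnorm is imported and urlparse comes from the Python 2 urlparse module, as in the original crawler3.py; the URLs are invented:

base = "http://www.test.com/page/index.html"
print(normalize_url(base, "https://other.example.org/About"))   # absolute URL: handed straight to urlnorm.norm
print(normalize_url(base, "/research/papers.html"))             # root-relative: joined with scheme + netloc of base
print(normalize_url(base, "research.html"))                     # page-relative: replaces the last path segment of base
print(normalize_url(base, "mailto:someone@example.org"))        # mailto links are rejected with False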
Example 2: main
def main():
    if (len(sys.argv) < 3):
        print "usage: python ll-print.py <url> <search term>"
        print "example: python ll-print.py http://www.hunch.com 'hunch team'"
        exit(0)
    root_URL = sys.argv[1]
    search_term = sys.argv[2]
    if (not validate_search_term(search_term)):
        print "Invalid search term. Please only use valid url characters and spaces."
        exit(1)
    first_letter = search_term[0]
    first_letter_match = root_URL.find(first_letter.lower())
    if (first_letter_match != -1):
        try:
            br = mechanize.Browser()
            br._factory.is_html = True
            result = []
            br.open(root_URL)
            # print "visiting: " + urlnorm.norm(br.geturl())
            visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
            result = find_matching_links(br, search_term, result, visited)
            if (result):
                max_index = max(result, key=lambda u: u[1])[1]
                for l, i, c in result:
                    print_url(l, i, max_index)
        except urlnorm.InvalidUrl:
            print "Invalid root URL"
        except urllib2.URLError, e:
            print "Error opening root URL"
            print e
        except Exception, e:
            print e
Author: rickychang, Project: letter-link-crawl, Lines: 32, Source file: llc.py
Example 3: find_matching_links
def find_matching_links(br, target_word, result, visited):
    if (not target_word):
        return result
    else:
        current_URL = urlnorm.norm(br.geturl())
        current_letter = target_word[0].lower()
        if (current_letter.isspace()):
            return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
        else:
            matching_index = current_URL[7:].find(current_letter)
            if (matching_index == -1):
                return []
            else:
                new_result = result + [(current_URL, matching_index + 7, current_letter)]
                links = list(br.links())
                for link in links:
                    try:
                        link_URL = urlnorm.norm(link.absolute_url)
                        if (link_URL not in visited):
                            br.open(link_URL)
                            new_visited = visited.copy()
                            new_visited.add(link_URL)
                            # print "visiting: " + urlnorm.norm(br.geturl())
                            new_visited.add(urlnorm.norm(br.geturl()))
                            child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
                            if (child_result):
                                return child_result
                    except Exception, e:
                        continue
Author: rickychang, Project: letter-link-crawl, Lines: 29, Source file: llc.py
Example 4: task_listener_crawler
def task_listener_crawler(gearman_worker, gearman_job):
    url = gearman_job.data
    url_frontier.add(url)
    urls = urlparse.urlparse(url)
    print "Crawling ", url
    response = requests.get(url, crawler_headers)
    print 'Downloaded page'
    if response.status_code == 200:
        raw_data = response.text
        if response.encoding != 'utf8':
            raw_data = response.text.encode(response.encoding).decode('utf8')
        r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200}, conflict="replace").run(rethink)
        links = linkregex.findall(raw_data)
        for link in (links.pop(0) for _ in xrange(len(links))):
            pre_norm_url = url_pre_norm(link, urls)
            norm_url = urlnorm.norm(pre_norm_url)
            norm_parts = urlparse.urlparse(norm_url)
            ext_url = norm_parts.path.split(".")[-1].lower()
            if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
                print "Add ", norm_url, " to redis queue"
                redis_client.rpush("urls:enqueued", norm_url)
        print "Done"
        return "ok"
    else:
        r.table(raw_result_table).insert({'url': url, 'status': response.status_code}, conflict="replace").run(rethink)
        return "fail"
Author: khanhicetea, Project: distributed-webcrawler, Lines: 27, Source file: crawler.py
Example 5: __init__
def __init__(self, url, previous=None, **info):
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This means case-sensitivity, and a
    # whole lot of other things that the urlnorm library will do for us.
    # We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url))
    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url
    self.set_previous(previous)
    self.info = info
    self.post = None
    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
Author: miracle2k, Project: track0, Lines: 27, Source file: spider.py
Example 6: processPage
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url)  # Converting relative URLs to absolute ones
            newurl = unicode(urlnorm.norm(newurl))  # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path))  # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions and disassembled.scheme in ['http', 'https'] and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited:  # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority)  # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Author: Walliee, Project: FocusedCrawler, Lines: 26, Source file: Crawler.py
Example 7: canonicalize
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superfluous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization and, at the moment,
    just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts
    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))
    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
Author: MattLeMay, Project: thresholderbot, Lines: 29, Source file: urlwork.py
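As a rough, standalone illustration of steps 2-4 above (normalize, strip utm_* parameters, sort the rest), the sketch below skips the redirect-resolution step and hard-codes the utm_* filter instead of calling the project's resolve() and exclude_param() helpers; it assumes only urlnorm and the Python 2 standard library modules already used in the snippet:

import cgi
import urllib
import urlparse

import urlnorm

def canonicalize_no_redirects(url):
    # Normalize, drop utm_* tracking params, sort the remaining params,
    # and drop the fragment.
    url = urlnorm.norm(url)
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    params = []
    for key, values in cgi.parse_qs(query).iteritems():
        if key.startswith('utm_'):
            continue
        params.extend((key, v) for v in values)
    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))

print(canonicalize_no_redirects("HTTP://Example.com/article?utm_source=x&b=2&a=1"))
# expected to be along the lines of: http://example.com/article?a=1&b=2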
Example 8: test_invalid_urls
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
Author: jehiah, Project: urlnorm, Lines: 7, Source file: test_urlnorm.py
Example 9: normalize_url
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
Author: adamchainz, Project: aleph, Lines: 8, Source file: urls.py
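A hypothetical call showing the intended effect (normalize, drop the fragment, strip the trailing slash), assuming the function above plus `import urlnorm` and `from urlparse import urldefrag`:

print(normalize_url("HTTP://Example.com/docs/#intro"))
# expected to print something like: http://example.com/docs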
Example 10: normalize_url
def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
Author: osks, Project: komfeeder, Lines: 8, Source file: feedimporter.py
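A hypothetical call, assuming the function above and `import urlnorm`; note that the scheme is stripped from the result:

print(normalize_url("HTTPS://Feeds.Example.com/rss"))  # e.g. feeds.example.com/rss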
Example 11: googleSearch
def googleSearch(searchString):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i] = unicode(urlnorm.norm(urls[i]))
    return urls
Author: Walliee, Project: FocusedCrawler, Lines: 9, Source file: gQuery.py
Example 12: new
def new(cls, *args, **kwargs):
    obj = cls(*args)
    obj.source = kwargs['source']
    obj.duplicates = 0
    obj.priority = 0
    # normalize url
    if hasattr(obj, 'url'):
        obj.url = urlnorm.norm(obj.url)
    return obj
Author: axknightroad, Project: metasearch, Lines: 9, Source file: base.py
Example 13: normalize_url
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
Author: rlugojr, Project: krauler, Lines: 9, Source file: url.py
Example 14: clean
def clean(self):
    """Ensures that URLs are canonized before saving"""
    self.value = refang(self.value.strip())
    try:
        if re.match(r"[^:]+://", self.value) is None:  # if no schema is specified, assume http://
            self.value = u"http://{}".format(self.value)
        self.value = urlnorm.norm(self.value)
    except urlnorm.InvalidUrl:
        raise ObservableValidationError("Invalid URL: {}".format(self.value))
Author: carriercomm, Project: yeti, Lines: 9, Source file: url.py
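The scheme-defaulting check above is easy to lift out on its own. A minimal sketch, assuming only `re` and `urlnorm` (the refang() call and Yeti's ObservableValidationError are left out):

import re

import urlnorm

def ensure_scheme_and_norm(value):
    # If no scheme is present, assume http:// before normalizing,
    # mirroring the check in clean() above.
    if re.match(r"[^:]+://", value) is None:
        value = u"http://{}".format(value)
    return urlnorm.norm(value)

print(ensure_scheme_and_norm("Example.com/some/Path"))  # e.g. http://example.com/some/Path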
Example 15: __init__
def __init__(self, url):
    """Construct from a string or Django request."""
    nurl = urlnorm.norm(url.encode('utf-16').lower())
    if hasattr(nurl, 'get_full_path'):
        nurl = nurl.get_full_path()
    self.scheme, self.netloc, self.path, self.params, \
        self.query, self.fragment = urlparse.urlparse(nurl)
    filename, self.ftype = os.path.splitext(self.path)
    self.args = dict(cgi.parse_qsl(self.query))
Author: file-citas, Project: pyhtoncrawler, Lines: 10, Source file: url.py
Example 16: dl_html
def dl_html(page):
    url = "http://en.wiktionary.org/wiki/%s" % page
    url = urlnorm.norm(url)
    # we should be able to crawl any page from the links we obtained
    # and we're obeying crawling delays here
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)
    time.sleep(config.page_crawl_delay)
    return response.read()
Author: Leeyp, Project: WiktionaryCrawler, Lines: 10, Source file: crawler.py
Example 17: fetch
def fetch(self, method, endpoint, params):
    api_endpoint = norm(self.api_base + endpoint)
    content = self.oauth.request(
        method,
        api_endpoint,
        params=params,
        headers={'User-Agent': 'Semantics3 Python Lib/0.2'}
    )
    print(content)
    return content
Author: abishekk92, Project: semantics3-python, Lines: 10, Source file: semantics3.py
Example 18: canonizeurl
def canonizeurl(url):
    split = urlsplit(urlnorm.norm(url))
    path = split[2].split(" ")[0]
    while path.startswith("/.."):
        path = path[3:]
    while path.endswith("%20"):
        path = path[:-3]
    # qs = urlencode(sorted(parse_qsl(split.query)))
    qs = ""
    return urlunsplit((split.scheme, split.netloc, path, qs, ""))
Author: piyushbjadhav, Project: pythoncrawler, Lines: 10, Source file: simpleCrawler.py
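A hypothetical call, assuming the function above plus `import urlnorm` and `from urlparse import urlsplit, urlunsplit`; note that the query string is discarded entirely:

print(canonizeurl("http://example.com/a/b?q=1"))  # expected along the lines of: http://example.com/a/b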
Example 19: dl_xml
def dl_xml(params):
    url = "http://en.wiktionary.org/w/api.php?format=xml"
    for key, val in params.iteritems():
        url += "&%s=%s" % (key, val)
    url = urlnorm.norm(url)
    # We're permitted to crawl any page with the API regardless
    # of robots.txt since we're using the API
    response = urllib2.urlopen(url.encode("utf8"), timeout=5)
    time.sleep(config.api_crawl_delay)
    return response.read()
Author: Leeyp, Project: WiktionaryCrawler, Lines: 12, Source file: crawler.py
Example 20: getImage
def getImage(self, opener, url, data, wait_time):
    """
    Directly get an image using urllib. Errors must be handled.

    *Optional Parameters*

    :param opener: urllib opener to use (use GetPage for setup)
    :param url: url address to use
    :param data: data to use in request (like that passed to urlencode)
    :param wait_time: time to wait for request
    """
    return opener.open(urlnorm.norm(url), data, wait_time).read()
Author: asevans48, Project: CrawlerAids, Lines: 13, Source file: GetImage.py
Note: The urlnorm.norm examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other source-code and documentation platforms. The snippets are drawn from open-source projects; copyright remains with the original authors. Please refer to each project's license before using or redistributing the code, and do not reproduce this article without permission.