This article collects typical usage examples of the tld.get_tld function in Python. If you are wondering what get_tld does, how to call it, or what real-world get_tld code looks like, the curated examples below should help.
The sections that follow show 20 code examples of get_tld, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
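Before diving into the examples, here is a minimal sketch of the call they all build on. Note that the return value changed across tld releases: older versions returned the registered domain (e.g. "example.co.uk"), while newer versions return only the suffix (e.g. "co.uk") and provide get_fld for the old behaviour. Most snippets below assume the older behaviour, so check your installed version before reusing their matching logic; the URL here is purely illustrative.

from tld import get_tld

url = "http://www.example.co.uk/some/path"
domain = get_tld(url)  # "example.co.uk" on older tld releases, "co.uk" on newer ones
safe = get_tld("not a url", fail_silently=True)  # None instead of raising a tld exception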
Example 1: _is_valid_link
def _is_valid_link(self, link):
    """
    Return True if the given link is not a document. This is not a perfect
    check, but it avoids a round trip to the server.
    """
    # Check ONLY_ROOTDOMAIN
    scheme, netloc, path, params, query, fragment = urlparse(link)
    try:
        if get_tld(self.base_url) == get_tld(link) and not ONLY_ROOTDOMAIN:
            # if get_tld(self.base_url) == get_tld(link):
            return False
    except Exception as e:
        log.error(str(e), self.base_url, link)
    # Need to add more
    DOC_EXT = [".pdf", ".xmls", ".docx", ".odt"]
    try:
        urlPath = [i for i in (path.split('/')) if i]
        file_name = urlPath[-1]
        ext = file_name.split(".")[-1]
    except IndexError:
        # It's just a root URL
        return True
    return ext not in DOC_EXT
Developer: muke5hy, Project: macaw, Lines: 31, Source: parser.py
Example 2: get_sources_sites
def get_sources_sites(html, sites):
    """ (str, list of str) -> list of [str, str]
    Searches the html and returns links that redirect to any of the given sites.
    Each stored link keeps the whole url and the domain name used for searching.
    Returns an empty list if none are found.
    Keyword arguments:
    html -- string of html
    sites -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the sites so that only the domain name is used for searching
    formatted_sites = []
    for site in sites:
        formatted_sites.append(tld.get_tld(site))
    for url in re.findall(
            "href=[\"\'][^\"\']*?.*?[^\"\']*?[\"\']", html, re.IGNORECASE):
        try:
            domain = tld.get_tld(url[6:-1])
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url[6:-1], domain])
        else:
            result_urls_unmatched.append([url[6:-1], domain])
    # Return the list
    return [result_urls_matched, result_urls_unmatched]
Developer: gitbronak, Project: Voyage, Lines: 34, Source: article_explorer.py
Example 3: get_source_sites
def get_source_sites(urls, sites):
    """ (list of urls, list of source site urls)
    Return a list of expanded urls found in the source urls,
    and a list of expanded urls not found in the source urls.
    """
    result_urls_matched = []
    result_urls_unmatched = []
    formatted_source_sites = []
    for site in sites:
        formatted_source_sites.append(tld.get_tld(site))
    for url in urls:
        try:
            # with eventlet, the request will time out in 10s
            with eventlet.Timeout(10):
                real_url = requests.get(url, timeout=10).url
                domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_source_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])
    return [result_urls_matched, result_urls_unmatched]
Developer: UTMediaCAT, Project: Voyage, Lines: 25, Source: twitter_crawler.py
Example 4: get_sources_sites
def get_sources_sites(article, sites):
    """ (article, list of str) -> list of [str, str]
    Searches the article's links and returns ones that redirect to any of the given sites.
    Each stored link keeps the whole url, the domain name used for searching,
    and the link text. Returns an empty list if none are found.
    Keyword arguments:
    article -- article object whose links are searched
    sites -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the sites so that only the domain name is used for searching
    formatted_sites = set()
    for site in sites:
        formatted_sites.add(tld.get_tld(site))
    for url in article.get_links(article_text_links_only=True):
        try:
            domain = tld.get_tld(url.href)
            # apparently they don't inherit a common class so I have to hard code it
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound, tld.exceptions.TldIOError):
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url.href, domain, url.text])
        else:
            result_urls_unmatched.append([url.href, domain, url.text])
    # Return the list
    return [result_urls_matched, result_urls_unmatched]
Developer: zhouwein, Project: Voyage, Lines: 32, Source: article_explorer.py
Example 5: current_site_domain
def current_site_domain(request=None):
    try:
        if request:
            from tld import get_tld
            domain = get_tld('http://' + request.get_host())
        else:
            domain = settings.SUBDOMAIN_BASE_DOMAIN
    except Exception:
        from django.contrib.sites.models import Site
        try:
            if request:
                d = Site.objects.get_current(request=request).domain
            else:
                d = Site.objects.first().domain
            if d[0:4] != 'http':
                d = 'http://' + d
            domain = get_tld(d)
        except Exception:
            try:
                domain = settings.SUBDOMAIN_BASE_DOMAIN
            except Exception:
                domain = Site.objects.first().domain
    prefix = 'www.'
    if getattr(settings, 'REMOVE_WWW_FROM_DOMAIN', False) \
            and domain.startswith(prefix):
        domain = domain.replace(prefix, '', 1)
    return domain
Developer: marcfro, Project: django-subdomains, Lines: 29, Source: utils.py
Example 6: analyze_url
def analyze_url(url):
    """
    Look just at the URL to see if a suitable title text can be found. This
    method is much faster than actually visiting the URL to find the title
    element in the downloaded file. We want to do this for special sites like
    Facebook, which doesn't allow anonymous downloading of certain pages, like
    group pages.

    Args:
        url: A string that is a URL

    Returns:
        A string that is the title text to be used. If no suitable title text
        can be produced, return the empty string, "".
    """
    try:
        tl = get_tld(url)
    except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
        logging.debug("bad TLD; trying with http:// prefix")
        try:
            tl = get_tld("http://" + url)
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
            logging.debug("still bad TLD; giving up")
            return ""
    if tl == "facebook.com" and "facebook.com/groups/" in url:
        return "Facebook group page post"
    return ""
Developer: riceissa, Project: autolink, Lines: 27, Source: autolink-legacy.py
Example 7: valid_domain
def valid_domain(domain):
    """Return True if the passed domain is valid, False otherwise."""
    try:
        get_tld(domain, fix_protocol=True)
        return True
    except:
        return False
Developer: matteomattei, Project: servermaintenance, Lines: 7, Source: lemp_manager.py
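A quick, hypothetical illustration of how a helper like valid_domain behaves. The calls and results below are assumptions based on the fix_protocol=True argument used above, which lets bare hostnames pass without a scheme:

valid_domain("example.co.uk")            # True  - recognised public suffix
valid_domain("https://github.com/x/y")   # True  - full URLs work as well
valid_domain("localhost")                # False - no registered TLD, so get_tld raises and the bare except returns False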
Example 8: get_source_sites
def get_source_sites(urls, sites):
    """ (list of url dicts, list of str) -> list of str
    Searches the urls of the tweet and returns links that redirect to any of the given sites.
    Returns an empty list if none are found.
    Keyword arguments:
    urls -- url structures from the tweet's Status object to be searched through
    sites -- list of site urls to look for
    """
    # store_all = configuration()['storage']['store_all_sources']
    result_urls_matched = []
    result_urls_unmatched = []
    formatted_sites = []
    for site in sites:
        formatted_sites.append(tld.get_tld(site))
    for url in urls:
        try:
            real_url = requests.get(url["expanded_url"], timeout=10).url
            domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])
    # Return the list
    return [result_urls_matched, result_urls_unmatched]
Developer: gitbronak, Project: Voyage, Lines: 33, Source: twitter_explorer.py
Example 9: parse
def parse(self, response):
    domain = get_tld(response.url)
    items = []
    url_result = urlparse(response.url)
    top_domain = url_result.scheme + '://' + url_result.netloc
    for sel in response.xpath('//a/@href'):
        item = LinkscrawlItem()
        link = sel.extract()
        if link.find("http://") == 0 or link.find("https://") == 0 or link.find("www.") == 0:
            try:
                target_domain = get_tld(link)
                # print domain + "===================" + target_domain + "==================" + link
                if domain != target_domain:
                    item['link'] = link
                    item['source'] = top_domain
                    yield item
                    # items.append(item)
                else:
                    yield scrapy.Request(link, callback=self.parse)
            except:
                print "The url can't get the domain. Ignored..." + link
        if link.startswith('/'):
            yield scrapy.Request(top_domain + link, callback=self.parse)
Developer: wonderfan, Project: python, Lines: 25, Source: LinkSpider.py
Example 10: crawl
def crawl(n):
    i = 0
    seed = []
    db = MySQLdb.connect(host='cspp53001.cs.uchicago.edu', db='jcbraunDB', user='jcbraun', passwd='3312crystal')
    cursor = db.cursor()
    outLinks = []
    if (n == 0):
        execString = ("SELECT URL, Domain FROM safeSeed WHERE crawled=0;")
        cursor.execute(execString)
        seedx = cursor.fetchall()
    else:
        execString = ("SELECT URLTo FROM safeOutboundLinks WHERE lvl=%i;" % (n))
        cursor.execute(execString)
        seedx = cursor.fetchall()
    print seedx
    for row in seedx:
        i = i + 1
        try:
            url = row[0]
            print url
            domain = get_tld(url, fail_silently=True)
            content = urllib2.urlopen(url, timeout=3).read(2000000)
            for k in re.findall('''href=["'](.[^"']+)["']''', content):
                z = ((re.match('http://', k) is not None) or (re.match('//', k) is not None))
                y = re.match('/', k)
                if (y):
                    k = (("/").join((re.split("/", url))) + k)
                if z or y:
                    domainTo = (get_tld(k, fail_silently=True))
                    print "domainTo is: %s" % domainTo
                    reqURL = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=f4p&key=AIzaSyCD0pNAG-6HVh_W6udGYZFz-2_p0yHDD5k&appver=31&pver=3.1&url=" + k
                    response = urllib2.urlopen(reqURL).getcode()
                    if (response == 200):
                        print ("Found dangerous site \n")
                        bad = 1
                        execString = ("INSERT INTO inboundLinks (Domain, domainTo, URL, URLto, Crawled) VALUES ('%s', '%s', '%s', '%s', 'false');" % (domain, domainTo, url, k))
                        cursor.execute(execString)
                    else:
                        bad = 0
                        execString = ("INSERT INTO safeOutboundLinks (Lvl, Domain, domainTo, URL, URLto, Crawled, toSpam) VALUES ('%i', '%s', '%s', '%s', '%s', '0', '%i');" % ((n + 1), domain, domainTo, url, k, bad))
                        cursor.execute(execString)
                    print("adding %s" % k)
                    db.commit()
            bank = open('notspam/%d.txt' % i, 'w')
            bank.write(content)
            content = db.escape_string(content)
            execString = ("INSERT INTO safeContent (Lvl, Content, Domain, URL, CopySource) VALUES ('%i', '%s', '%s', '%s', 'crawl');" % ((n + 1), content, domain, url))
            cursor.execute(execString)
            print url + " success! \n"
            bank.close()
        except Exception as e:
            print ("Broken link to %s" % url)
            print (type(e))
            print (e.args)
    db.commit()
    db.close()
Developer: 0x0mar, Project: Fishing-for-Phishing, Lines: 59, Source: safeCrawl.py
Example 11: _from_same_site
def _from_same_site(self, ads_host, ads_target):
    if ads_target is None:
        return True
    if not ads_target.startswith("http"):
        return True
    ads_host_domain = get_tld(ads_host, as_object=True).domain
    ads_target_domain = get_tld(ads_target, as_object=True).domain
    return True if ads_host_domain == ads_target_domain else False
Developer: yuanbei, Project: adspider, Lines: 8, Source: ads_profile_spider.py
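Several of the snippets here (examples 11, 15, and 18) call get_tld with as_object=True and then read attributes off the result. A hedged sketch of what those attribute accesses imply; the attribute names are taken from the examples themselves, and exact values depend on your tld version:

from tld import get_tld

res = get_tld("http://blog.example.co.uk/post", as_object=True)
# res.domain -> the registrable name without the suffix, e.g. "example" (used in examples 11 and 15)
# res.suffix -> the public suffix, e.g. "co.uk" (combined with res.domain in example 15)
# res.tld    -> the string form that example 18 stores as the referer tag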
Example 12: valid_a_href
def valid_a_href(a_elements, main_url=None):
    hrefs = [a.get('href') for a in a_elements]
    hrefs = [link for link in hrefs if link and link.startswith('http://')]
    if main_url:
        main_tld = get_tld(main_url, fail_silently=True)
        hrefs = [link for link in hrefs if get_tld(link, fail_silently=True) == main_tld]
    return hrefs
Developer: shenxiangq, Project: news_crawler, Lines: 8, Source: util.py
Example 13: get_link_text
def get_link_text(url, mime_type, data=None):
    '''
    Take URL, MIME type, and optional data to produce the link text.
    '''
    tld = get_tld(url)
    result = "File on " + tld
    if mime_type.startswith("image"):
        result = "Image on " + tld
    elif mime_type == "application/pdf":
        result = "PDF on " + tld
    elif "text/html" in mime_type:
        try:
            soup = BeautifulSoup(data, 'html.parser')
            meta = soup.find_all("meta")
            possible = [i.get("content") for i in meta if i.get("property") == "og:title"]
            if possible:
                result = possible[0].strip()
            elif soup.title.string:
                result = messy_title_parse(soup.title.string)
            else:
                result = "Page on " + tld
        except AttributeError:
            # Probably just empty title when trying to get
            # soup.title.string
            result = "Page on " + tld
    if len(result) > 255:
        result = result[:253] + " …"
    return result
Developer: riceissa, Project: static-site-generator, Lines: 29, Source: autolink.py
Example 14: get_monthly_archive_urls
def get_monthly_archive_urls(links, page_url):
    '''Scans the provided links for blogspot-style archives, which are of
    the form website.com/yyyy_mm_01_archive.html'''
    domain = get_tld(page_url)
    monthly_archive_urls = []
    for link in links:
        # Try for drop down lists using <option value="url.com/...">
        try:
            url = link.attrs['value']
            match = re.search(domain + "/\d{4}_\d{2}_01_archive.html", url)
            if match:
                monthly_archive_urls.append(url)
        except KeyError:
            pass
        # Try for actual <a href="url.com/..."> links
        try:
            url = link.attrs['href']
            match = re.search(domain + "/\d{4}_\d{2}_01_archive.html", url)
            if match:
                monthly_archive_urls.append(url)
        except KeyError:
            pass
    return list(set(monthly_archive_urls))
Developer: malnoxon, Project: rsstory, Lines: 27, Source: blogspot.py
Example 15: bulk_insert_urls
def bulk_insert_urls(self, content):
    logger.debug("in bulk insert urls")
    for line in content:
        items = line.split()
        if len(items) < 9:
            logger.error("error parsing line")
            logger.error(line)
        else:
            if ("http" in items[8] and "//" in items[8]):
                parts = items[8].split("//")[1].split("/")
                domain = parts[0]
                res = get_tld(items[8], as_object=True, fail_silently=True)
                if res is not None:
                    tld = "%s.%s" % (res.domain, res.suffix)
                else:
                    tld = parts[0]
                path = ""
                if len(parts) > 0:
                    path = "".join(parts[1:])
                url = {'ts': items[2].split(".")[0], 'host': items[4], 'tld': tld, 'domain': domain, 'path': path}
                try:
                    logger.debug("inserting %s %s %s" % (url['ts'], url['host'], url['tld']))
                    self.conn.execute("INSERT INTO URLS(ts, host, tld, domain, path, datasource) VALUES(?,?,?,?,?,?)", (url['ts'], url['host'], url['tld'], url['domain'], url['path'], 'squid'))
                except Exception, e:
                    logger.error("error inserting url %s" % str(url))
Developer: ucn-eu, Project: ucnviz, Lines: 31, Source: datadb.py
Example 16: scrap
def scrap(self, url):
    self.external_domain = "http://" + get_tld(url)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text)
    self._description(soup)
    self._get_episodes(soup)
Developer: Faianca, Project: Anime-Tv-shows-Scrapper, Lines: 7, Source: series.py
Example 17: run
def run(self, target):
    domainname = get_tld("http://" + target)
    whoisdomaincmd = 'whois ' + domainname + ' > whois-domain-' + domainname + '.txt'
    print "Whois DOMAIN lookup cmd: " + whoisdomaincmd
    print commands.getoutput(whoisdomaincmd)
Developer: xpn, Project: iRecon, Lines: 7, Source: iRecon.py
Example 18: _record_time
def _record_time(self, request):
    if hasattr(request, '_start_time'):
        ms = int((time.time() - request._start_time) * 1000)
        if request.is_ajax():
            is_ajax = True
        else:
            is_ajax = False
        is_authenticated = False
        is_staff = False
        is_superuser = False
        if is_user_authenticated(request.user):
            is_authenticated = True
            if request.user.is_staff:
                is_staff = True
            if request.user.is_superuser:
                is_superuser = True
        referer = request.META.get('HTTP_REFERER')
        referer_tld = None
        referer_tld_string = ''
        if referer:
            try:
                referer_tld = get_tld(referer, as_object=True)
            except (TldBadUrl, TldDomainNotFound, TldIOError):
                pass
        if referer_tld:
            referer_tld_string = referer_tld.tld
        url = request.get_full_path()
        url_query = parse.parse_qs(parse.urlparse(url).query)
        # This allows you to measure click rates for ad-campaigns, just
        # make sure that your ads have `?campaign=something` in the URL
        campaign_keyword = getattr(
            settings, 'INFLUXDB_METRICS_CAMPAIGN_KEYWORD', 'campaign')
        campaign = ''
        if campaign_keyword in url_query:
            campaign = url_query[campaign_keyword][0]
        data = [{
            'measurement': 'django_request',
            'tags': {
                'host': settings.INFLUXDB_TAGS_HOST,
                'is_ajax': is_ajax,
                'is_authenticated': is_authenticated,
                'is_staff': is_staff,
                'is_superuser': is_superuser,
                'method': request.method,
                'module': request._view_module,
                'view': request._view_name,
                'referer': referer,
                'referer_tld': referer_tld_string,
                'full_path': url,
                'path': request.path,
                'campaign': campaign,
            },
            'fields': {'value': ms, },
        }]
        write_points(data)
Developer: bitmazk, Project: django-influxdb-metrics, Lines: 60, Source: middleware.py
Example 19: extract_tld
def extract_tld(self, url):
    try:
        return get_tld(url)
    except:
        traceback.print_exc()
        print "\n\nInvalid url: %s" % url
        return url
Developer: rtapella, Project: memex-explorer, Lines: 7, Source: domain.py
Example 20: __init__
def __init__(self, entry, site_hash=None):
    self.request = entry['request']
    self.response = entry['response']
    self.result = {'url': None,
                   'url_sha512': None,
                   'ip': None,
                   'vhost': None,
                   'tld': None,
                   'ip_country': None,
                   'content_sha512': None,
                   'content_sha256': None,
                   'content_type': None,
                   'content_encoding': None,
                   'content': None,
                   'in_alexa': False,
                   'http_status': None,
                   'redirect_url': None
                   }
    self.result['url'] = self.request['url']
    self.result['url_sha512'] = sha512(self.result['url']).hexdigest()
    try:
        self.result['tld'] = get_tld(self.result['url'])
    except TldDomainNotFound:
        pass
    if 'serverIPAddress' in entry:
        self.result['ip'] = entry['serverIPAddress']
    for header in self.request['headers']:
        if header['name'] == 'Host':
            self.result['vhost'] = header['value']
Developer: aboutsecurity, Project: SiteParser, Lines: 34, Source: ParseHarEntry.py
Note: The tld.get_tld examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Refer to each project's license before redistributing or using the code, and do not republish without permission.