Python tld.get_tld Function Code Examples


This article collects typical usage examples of the Python tld.get_tld function. If you are unsure what get_tld does, how to call it, or what real-world usage looks like, the curated code samples below should help.



The following presents 20 code examples of the get_tld function, ordered by popularity by default.
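Before the examples, here is a minimal usage sketch. The URL below is a made-up example, and the exact return value depends on the installed tld version: older releases return the registered domain (e.g. "example.co.uk"), while newer releases return only the public suffix and provide get_fld for the registered domain.

# Minimal usage sketch (the URL is a made-up example).
from tld import get_tld
from tld.exceptions import TldBadUrl, TldDomainNotFound

url = "http://www.example.co.uk/some/path"

try:
    # Raises TldBadUrl / TldDomainNotFound on input it cannot parse.
    print(get_tld(url))
except (TldBadUrl, TldDomainNotFound):
    print("could not extract a TLD from", url)

# as_object=True returns a parsed result object instead of a plain string;
# attributes such as .tld and .domain are available (names vary slightly
# between tld versions).
res = get_tld(url, as_object=True)
print(res.domain, res.tld)

# fail_silently=True returns None instead of raising.
print(get_tld("not a url", fail_silently=True))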

Example 1: _is_valid_link

    def _is_valid_link(self, link):
        """
        Return True if the given link is not a document. This is not a perfect
        check, but it avoids a round trip to the server.
        """

        # Check ONLY_ROOTDOMAIN

        scheme, netloc, path, params, query, fragment = urlparse(link)

        try:
            if get_tld(self.base_url) == get_tld(link) and not ONLY_ROOTDOMAIN:
            # if get_tld(self.base_url) == get_tld(link):
                return False
        except Exception as e:
            log.error(str(e), self.base_url, link)


        # Need to add more
        DOC_EXT = [".pdf", ".xmls", ".docx", ".odt"]

        try:

            urlPath = [i for i in (path.split('/')) if i]

            file_name = urlPath[-1]
            ext = file_name.split(".")[-1]
        except IndexError:
            # It's just a root URL
            return True
        return ext not in DOC_EXT
Author: muke5hy | Project: macaw | Lines: 31 | Source: parser.py


Example 2: get_sources_sites

def get_sources_sites(html, sites):
    """ (str, list of str) -> list of [str, str]
    Searches the html for links that redirect to the given sites.
    Each returned link stores the whole url and the domain name used for searching.
    Returns an empty list if none are found.

    Keyword arguments:
    html                -- string of html
    sites               -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the site to assure only the domain name for searching
    formatted_sites = []

    for site in sites:
        formatted_sites.append(tld.get_tld(site))

    for url in re.findall(
            "href=[\"\'][^\"\']*?.*?[^\"\']*?[\"\']", html, re.IGNORECASE):
        try:
            domain = tld.get_tld(url[6:-1])
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url[6:-1], domain])
        else:
            result_urls_unmatched.append([url[6:-1], domain])

    # Return the list
    return [result_urls_matched,result_urls_unmatched]
Author: gitbronak | Project: Voyage | Lines: 34 | Source: article_explorer.py


Example 3: get_source_sites

def get_source_sites(urls, sites):
    """ (list of urls, list of source site urls)
    Return a list of expanded urls found in source urls,
    and a list of expanded urls not found in source urls.
    """
    result_urls_matched = []
    result_urls_unmatched = []
    formatted_source_sites = []
    for site in sites:
        formatted_source_sites.append(tld.get_tld(site))
    for url in urls:
        try:
            # with eventlet, the request will time out in 10s
            with eventlet.Timeout(10):
                real_url = requests.get(url, timeout=10).url
            domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_source_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])

    return [result_urls_matched,result_urls_unmatched]
Author: UTMediaCAT | Project: Voyage | Lines: 25 | Source: twitter_crawler.py


Example 4: get_sources_sites

def get_sources_sites(article, sites):
    """ (str, list of str) -> list of [str, str]
    Searches the article's links for ones that redirect to the given sites.
    Each returned link stores the whole url and the domain name used for searching.
    Returns an empty list if none are found.

    Keyword arguments:
    html                -- string of html
    sites               -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the site to assure only the domain name for searching
    formatted_sites = set()

    for site in sites:
        formatted_sites.add(tld.get_tld(site))

    for url in article.get_links(article_text_links_only=True):
        try:
            domain = tld.get_tld(url.href)
        #apparently they don't inherit a common class so I have to hard code it
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound, tld.exceptions.TldIOError):
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url.href, domain, url.text])
        else:
            result_urls_unmatched.append([url.href, domain, url.text])

    # Return the list
    return [result_urls_matched,result_urls_unmatched]
Author: zhouwein | Project: Voyage | Lines: 32 | Source: article_explorer.py


Example 5: current_site_domain

def current_site_domain(request=None):
    try:
        if request:
            from tld import get_tld
            domain = get_tld('http://' + request.get_host())
        else:
            domain = settings.SUBDOMAIN_BASE_DOMAIN
    except Exception:
        from django.contrib.sites.models import Site
        try:
            if request:
                d = Site.objects.get_current(request=request).domain
            else:
                d = Site.objects.first().domain
            if d[0:4] != 'http':
                d = 'http://' + d
            domain = get_tld(d)
        except Exception:
            try:
                domain = settings.SUBDOMAIN_BASE_DOMAIN
            except Exception:
                domain = Site.objects.first().domain

    prefix = 'www.'
    if getattr(settings, 'REMOVE_WWW_FROM_DOMAIN', False) \
            and domain.startswith(prefix):
        domain = domain.replace(prefix, '', 1)

    return domain
Author: marcfro | Project: django-subdomains | Lines: 29 | Source: utils.py


Example 6: analyze_url

def analyze_url(url):
    """
    Look just at the URL to see if a suitable title text can be found.  This
    method is much faster than actually visiting the URL to find the title
    element in the downloaded file. We want to do this for special sites like
    Facebook, which doesn't allow anonymous downloading of certain pages, like
    group pages.

    Args:
        url: A string that is a URL

    Returns:
        A string that is the title text to be used. If no suitable title text
        can be produced, return the empty string, "".
    """
    try:
        tl = get_tld(url)
    except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
        logging.debug("bad TLD; trying with http:// prefix")
        try:
            tl = get_tld("http://" + url)
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
            logging.debug("still bad TLD; giving up")
            return ""
    if tl == "facebook.com" and "facebook.com/groups/" in url:
            return "Facebook group page post"
    return ""
Author: riceissa | Project: autolink | Lines: 27 | Source: autolink-legacy.py


Example 7: valid_domain

def valid_domain(domain):
    """This function return True if the passed domain is valid, false otherwise"""
    try:
        get_tld(domain,fix_protocol=True)
        return True
    except:
        return False
Author: matteomattei | Project: servermaintenance | Lines: 7 | Source: lemp_manager.py


Example 8: get_source_sites

def get_source_sites(urls, sites):
    """ (status, list of str) -> list of str
    Searches the tweet's urls for links that redirect to the given sites.
    Returns an empty list if none are found.

    Keyword arguments:
    tweet           -- Status structure to be searched through
    sites           -- List of site urls to look for
    """
    # store_all = configuration()['storage']['store_all_sources']

    result_urls_matched = []
    result_urls_unmatched = []
    formatted_sites = []

    for site in sites:
        formatted_sites.append(tld.get_tld(site))

    for url in urls:
        try:
            real_url = requests.get(url["expanded_url"], timeout=10).url
            domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])

    # Return the list
    return [result_urls_matched, result_urls_unmatched]
Author: gitbronak | Project: Voyage | Lines: 33 | Source: twitter_explorer.py


Example 9: parse

 def parse(self, response):
     domain = get_tld(response.url)
     items = []
     url_result = urlparse(response.url)
     top_domain = url_result.scheme + '://'+url_result.netloc
     
     for sel in response.xpath('//a/@href'):
         item = LinkscrawlItem()
         link = sel.extract()
         if link.find("http://") == 0 or link.find("https://") == 0 or link.find("www.") == 0:
             try:
                 target_domain = get_tld(link)
                 #print domain +"==================="+target_domain +"==================" + link
                 if domain != target_domain:
                     item['link'] = link
                     item['source'] = top_domain
                     yield item
                     #items.append(item)
                 else:
                     yield scrapy.Request(link,callback=self.parse)
             except:
                 print "The url can't get the domain. Ignored..." + link
                 
         if link.startswith('/'):
             yield scrapy.Request(top_domain+link, callback=self.parse)         
Author: wonderfan | Project: python | Lines: 25 | Source: LinkSpider.py


Example 10: crawl

def crawl(n):
	i=0
	seed = []
	db = MySQLdb.connect(host='cspp53001.cs.uchicago.edu',db='jcbraunDB',user='jcbraun',passwd='3312crystal')
	cursor = db.cursor()
	outLinks = []
	if (n ==0):
		execString = ("SELECT URL, Domain FROM safeSeed WHERE crawled=0;") 
		cursor.execute(execString)
		seedx = cursor.fetchall()
		
	else:
		execString = ("SELECT URLTo FROM safeOutboundLinks WHERE lvl=%i;" % (n)) 
		cursor.execute(execString)
		seedx = cursor.fetchall()
		print seedx

	for row in seedx:
		i = i+1
		try:
			url = row[0]
			print url
			domain = get_tld(url, fail_silently=True)
			content = urllib2.urlopen(url, timeout=3).read(2000000)
			for k in re.findall('''href=["'](.[^"']+)["']''', content):			
				z = ((re.match('http://' , k) is not None) or (re.match('//' , k) is not None))
				y = re.match('/' , k)
				if (y):
					k = (("/").join((re.split("/", url)))+k)			
				if z or y:
					domainTo = (get_tld(k, fail_silently=True))
					print "domainTo is: %s" %domainTo
					reqURL = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=f4p&key=AIzaSyCD0pNAG-6HVh_W6udGYZFz-2_p0yHDD5k&appver=31&pver=3.1&url=" + k
					response = urllib2.urlopen(reqURL).getcode()
					if (response==200):
						print ("Found dangerous site \n")
						bad = 1
						execString = ("INSERT INTO inboundLinks (Domain, domainTo, URL, URLto, Crawled) VALUES ('%s', '%s', '%s', '%s', 'false');" % (domain, domainTo, url, k))
						cursor.execute(execString)
					else:
						bad = 0
						execString = ("INSERT INTO safeOutboundLinks (Lvl, Domain, domainTo, URL, URLto, Crawled, toSpam) VALUES ('%i', '%s', '%s', '%s', '%s', '0', '%i');" % ((n+1), domain, domainTo, url, k, bad))
						cursor.execute(execString)
						print("adding %s" %k)
					db.commit()	
			bank = open('notspam/%d.txt' %i, 'w')
			bank.write (content)
			content=db.escape_string(content)
			execString = ("INSERT INTO safeContent (Lvl, Content, Domain, URL, CopySource) VALUES ('%i', '%s', '%s', '%s', 'crawl');" % ((n+1), content, domain, url)) 
			cursor.execute(execString)
			print url + " success! \n"
			bank.close()
			
		except Exception as e:
			print ("Broken link to %s" %url)	
			print (type(e))
			print (e.args)
	db.commit()
	db.close()
Author: 0x0mar | Project: Fishing-for-Phishing | Lines: 59 | Source: safeCrawl.py


Example 11: _from_same_site

 def _from_same_site(self, ads_host, ads_target):
     if ads_target is None:
         return True
     if not ads_target.startswith("http"):
         return True
     ads_host_domain = get_tld(ads_host, as_object=True).domain
     ads_target_domain = get_tld(ads_target, as_object=True).domain
     return True if ads_host_domain == ads_target_domain else False
Author: yuanbei | Project: adspider | Lines: 8 | Source: ads_profile_spider.py


Example 12: valid_a_href

def valid_a_href(a_elements, main_url=None):
    hrefs = [a.get('href') for a in a_elements]
    hrefs = [link for link in hrefs if link and link.startswith('http://')]
    if main_url:
        main_tld = get_tld(main_url, fail_silently=True)
        hrefs = [link for link in hrefs if get_tld(link, fail_silently=True) == main_tld]

    return hrefs
Author: shenxiangq | Project: news_crawler | Lines: 8 | Source: util.py


Example 13: get_link_text

def get_link_text(url, mime_type, data=None):
    '''
    Take URL, MIME type, and optional data to produce the link text.
    '''
    tld = get_tld(url)
    result = "File on " + tld
    if mime_type.startswith("image"):
        result = "Image on " + tld
    elif mime_type == "application/pdf":
        result = "PDF on " + tld
    elif "text/html" in mime_type:
        try:
            soup = BeautifulSoup(data, 'html.parser')
            meta = soup.find_all("meta")
            possible = [i.get("content") for i in meta if i.get("property") == "og:title"]
            if possible:
                result = possible[0].strip()
            elif soup.title.string:
                result = messy_title_parse(soup.title.string)
            else:
                result = "Page on " + tld
        except AttributeError:
            # Probably just empty title when trying to get
            # soup.title.string
            result = "Page on " + tld
    if len(result) > 255:
        result = result[:253] + " …"

    return result
Author: riceissa | Project: static-site-generator | Lines: 29 | Source: autolink.py


Example 14: get_monthly_archive_urls

def get_monthly_archive_urls(links, page_url):
    '''Scans the provided links for blogspot style archives which are of
    the form website.com/yyyy_mm_01_archive.html'''
    domain = get_tld(page_url)
    monthly_archive_urls = []
    for link in links:
        # Try for drop down lists using <option value="url.com/...">
        try:
            url = link.attrs['value']
            match = re.search(domain + "/\d{4}_\d{2}_01_archive.html", url)
            if match:
                monthly_archive_urls.append(url)

        except KeyError:
            pass

        # Try for actual <a href="url.com/..." > links
        try:
            url = link.attrs['href']
            match = re.search(domain + "/\d{4}_\d{2}_01_archive.html", url)
            if match:
                monthly_archive_urls.append(url)

        except KeyError:
            pass

    return list(set(monthly_archive_urls))
Author: malnoxon | Project: rsstory | Lines: 27 | Source: blogspot.py


Example 15: bulk_insert_urls

	def bulk_insert_urls(self, content):
		logger.debug("in bulk insert urls")

		for line in content:

			items = line.split()

			if len(items) < 9:
				logger.error("error parsing line")
				logger.error(line)
			else:
				if ("http" in items[8]  and "//" in items[8]):
					parts  = items[8].split("//")[1].split("/")

					domain = parts[0]
					res = get_tld(items[8], as_object=True, fail_silently=True)

					if res is not None:
						tld = "%s.%s" % (res.domain, res.suffix)
					else:
						tld = parts[0]
					path = ""
					if len(parts) > 0:
						path = "".join(parts[1:])

					url = {'ts':items[2].split(".")[0], 'host':items[4], 'tld':tld, 'domain':domain, 'path': path}
					try:
						logger.debug("inserting %s %s %s" % (url['ts'], url['host'], url['tld']))
						self.conn.execute("INSERT INTO URLS(ts, host, tld, domain, path, datasource) VALUES(?,?,?,?,?,?)", (url['ts'], url['host'],url['tld'], url['domain'], url['path'], 'squid'))
					except Exception, e:
						logger.error("error inserting url %s" % str(url))
Author: ucn-eu | Project: ucnviz | Lines: 31 | Source: datadb.py


Example 16: scrap

    def scrap(self, url):
        self.external_domain = "http://"+get_tld(url)
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text)

        self._description(soup)
        self._get_episodes(soup)
Author: Faianca | Project: Anime-Tv-shows-Scrapper | Lines: 7 | Source: series.py


Example 17: run

	def run(self, target):
		
		domainname = get_tld("http://"+target)
		
		whoisdomaincmd='whois ' + domainname + ' > whois-domain-' + domainname + '.txt'
		print "Whois DOMAIN lookup cmd: " + whoisdomaincmd
		print commands.getoutput(whoisdomaincmd)
Author: xpn | Project: iRecon | Lines: 7 | Source: iRecon.py


Example 18: _record_time

    def _record_time(self, request):
        if hasattr(request, '_start_time'):
            ms = int((time.time() - request._start_time) * 1000)
            if request.is_ajax():
                is_ajax = True
            else:
                is_ajax = False

            is_authenticated = False
            is_staff = False
            is_superuser = False
            if is_user_authenticated(request.user):
                is_authenticated = True
                if request.user.is_staff:
                    is_staff = True
                if request.user.is_superuser:
                    is_superuser = True

            referer = request.META.get('HTTP_REFERER')
            referer_tld = None
            referer_tld_string = ''
            if referer:
                try:
                    referer_tld = get_tld(referer, as_object=True)
                except (TldBadUrl, TldDomainNotFound, TldIOError):
                    pass
            if referer_tld:
                referer_tld_string = referer_tld.tld

            url = request.get_full_path()
            url_query = parse.parse_qs(parse.urlparse(url).query)

            # This allows you to measure click rates for ad-campaigns, just
            # make sure that your ads have `?campaign=something` in the URL
            campaign_keyword = getattr(
                settings, 'INFLUXDB_METRICS_CAMPAIGN_KEYWORD', 'campaign')
            campaign = ''
            if campaign_keyword in url_query:
                campaign = url_query[campaign_keyword][0]

            data = [{
                'measurement': 'django_request',
                'tags': {
                    'host': settings.INFLUXDB_TAGS_HOST,
                    'is_ajax': is_ajax,
                    'is_authenticated': is_authenticated,
                    'is_staff': is_staff,
                    'is_superuser': is_superuser,
                    'method': request.method,
                    'module': request._view_module,
                    'view': request._view_name,
                    'referer': referer,
                    'referer_tld': referer_tld_string,
                    'full_path': url,
                    'path': request.path,
                    'campaign': campaign,
                },
                'fields': {'value': ms, },
            }]
            write_points(data)
Author: bitmazk | Project: django-influxdb-metrics | Lines: 60 | Source: middleware.py


Example 19: extract_tld

 def extract_tld(self, url):
     try:
         return get_tld(url)
     except:
         traceback.print_exc()
         print "\n\nInvalid url: %s" % url
         return url
Author: rtapella | Project: memex-explorer | Lines: 7 | Source: domain.py


Example 20: __init__

    def __init__(self, entry, site_hash=None ):
        self.request = entry['request']
        self.response = entry['response']

        self.result = { 'url': None, 
                'url_sha512': None,
                'ip': None, 
                'vhost': None, 
                'tld': None, 
                'ip_country': None, 
                'content_sha512': None, 
                'content_sha256': None, 
                'content_type': None , 
                'content_encoding': None,
                'content': None,
                'in_alexa': False,
                'http_status': None,
                'redirect_url': None
                }

        self.result['url'] = self.request['url']
        self.result['url_sha512'] = sha512(self.result['url']).hexdigest()

        try:
            self.result['tld'] = get_tld(self.result['url'])
        except TldDomainNotFound:
            pass

        if 'serverIPAddress' in entry: 
            self.result['ip'] = entry['serverIPAddress']

        for header in self.request['headers']:
            if header['name'] == 'Host':
                self.result['vhost'] = header['value']
Author: aboutsecurity | Project: SiteParser | Lines: 34 | Source: ParseHarEntry.py
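Across the examples above, three recurring strategies deal with URLs that get_tld cannot parse: catching the specific tld exceptions (examples 4, 6, 18, 20), passing fail_silently=True to receive None instead (examples 10, 12, 15), and passing fix_protocol=True so schemeless input such as a bare hostname is accepted (example 7). The sketch below, using made-up URLs, contrasts the three; exact behaviour may vary slightly between tld versions.

# Contrast of the three error-handling styles seen above (made-up URLs).
from tld import get_tld
from tld.exceptions import TldBadUrl, TldDomainNotFound

urls = ["https://blog.example.org/post/1", "not-a-url", "example.com"]

for url in urls:
    # 1. Explicit exception handling.
    try:
        print("exceptions:   ", get_tld(url))
    except (TldBadUrl, TldDomainNotFound):
        print("exceptions:    failed for", url)

    # 2. fail_silently=True returns None instead of raising.
    print("fail_silently:", get_tld(url, fail_silently=True))

    # 3. fix_protocol=True prepends a scheme, so a bare hostname like
    #    "example.com" is accepted.
    print("fix_protocol: ", get_tld(url, fail_silently=True, fix_protocol=True))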



Note: The tld.get_tld examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, and copyright remains with those authors. Please consult each project's License before redistributing or using the code; do not reproduce without permission.

