Python urlparse.urljoin Function Code Examples


This article collects typical, real-world usage examples of Python's urlparse.urljoin function. If you are wondering what exactly urljoin does, how to call it, or what code that uses it looks like in practice, the hand-picked examples below should help.



The following shows 20 code examples of the urljoin function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
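
For readers new to the function, here is a minimal sketch of its basic behaviour (Python 2, where urljoin lives in the urlparse module; in Python 3 the same function is urllib.parse.urljoin). The URLs are illustrative placeholders, not taken from the examples below:

    import urlparse

    base = 'http://www.example.com/series/page.html'

    # A relative reference replaces the last path segment of the base URL.
    print urlparse.urljoin(base, 'episode-1.html')
    # -> http://www.example.com/series/episode-1.html

    # A root-relative reference keeps only the scheme and host of the base.
    print urlparse.urljoin(base, '/search.php?q=foo')
    # -> http://www.example.com/search.php?q=foo

    # An absolute reference is returned unchanged.
    print urlparse.urljoin(base, 'http://other.example.org/x')
    # -> http://other.example.org/x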

Example 1: get_sub

 def get_sub(self):
     """Fetches the subtitles from addic7ed from the url specified in the given database (db) for that episode"""
     url_split = urlparse.urlsplit(self.url)
     head, tail = url_split.path.rsplit('/', 1)
     new_path = (head, 'addic7ed')
     referer = urlparse.urlunsplit(url_split._replace(path=urlparse.urljoin(*new_path)))

     domain = self.url
     response = urllib2.urlopen(domain)  # open the url
     html = response.read()  # load the html code
     soup = BeautifulSoup(html)  # parse the html code
     links = []
     for x in soup.find_all(class_="buttonDownload"):
         links.append(x.attrs['href'])

     domain = 'http://www.addic7ed.com/'
     urls = []
     for link in links:
         urls.append(urlparse.urljoin(domain, link))

     page = urls[0]
     req = urllib2.Request(page, headers={'User-Agent': 'Mozilla 5.10', 'Referer': referer})
     response = urllib2.urlopen(req)
     data = response.read()

     test = response.info()
     print test

     if response.info().has_key('Content-Disposition'):
         with open(os.path.join(self.db.env.subs_dir, '%s.srt' % self.title), 'wb') as f:
             f.write(data)
     else:
         return response.info()
Developer: urucaro | Project: edu | Lines: 33 | Source file: __init__.py


Example 2: handle_captcha

 def handle_captcha(self, response, solver):
     sel = scrapy.Selector(response)
     iframe_src = sel.xpath(self.CAPTCHA_XPATH).extract()[0]
     iframe_url = urljoin(response.url, iframe_src)
     iframe_request = scrapy.Request(iframe_url)
     iframe_response = yield download(self.crawler, iframe_request)
     iframe_sel = scrapy.Selector(iframe_response)
     img_src, = iframe_sel.xpath('//img/@src').extract()[:1] or [None]
     if img_src is None:
         raise DecaptchaError('No //img/@src found on CAPTCHA page')
     img_url = urljoin(iframe_response.url, img_src)
     img_request = scrapy.Request(img_url)
     img_response = yield download(self.crawler, img_request)
     scrapy.log.msg('CAPTCHA image downloaded, solving')
     captcha_text = yield solver.solve(img_response.body)
     scrapy.log.msg('CAPTCHA solved: %s' % captcha_text)
     challenge_request = scrapy.FormRequest.from_response(
         iframe_response, formxpath='//form',
         formdata={'recaptcha_response_field': captcha_text}
     )
     challenge_response = yield download(self.crawler, challenge_request)
     challenge_sel = scrapy.Selector(challenge_response)
     challenge, = challenge_sel.xpath(
         '//textarea/text()'
     ).extract()[:1] or [None]
     if not challenge:
         raise DecaptchaError('Bad challenge from reCAPTCHA API:\n%s' %
                              challenge_response.body)
     scrapy.log.msg('CAPTCHA solved, submitting challenge')
     submit_request = scrapy.FormRequest.from_response(
         response, formxpath='//form[.%s]' % self.CAPTCHA_XPATH,
         formdata={'recaptcha_challenge_field': challenge}
     )
     yield download(self.crawler, submit_request)
Developer: JonathanBowker | Project: decaptcha | Lines: 34 | Source file: recaptcha.py


Example 3: sources

    def sources(self, url, hostDict, hostprDict):
        try:
            sources = []

            if url == None: return sources
            url = urlparse.urljoin(self.base_link, url)
            for i in range(3):
                result = client.request(url, timeout=10)
                if not result == None: break
            
            dom = dom_parser.parse_dom(result, 'div', attrs={'class':'links', 'id': 'noSubs'})
            result = dom[0].content
            
            links = re.compile('<tr\s*>\s*<td><i\s+class="fa fa-youtube link-logo"></i>([^<]+).*?href="([^"]+)"\s+class="watch',re.DOTALL).findall(result)         
            for link in links[:5]:
                try:
                    url2 = urlparse.urljoin(self.base_link, link[1])
                    for i in range(2):
                        result2 = client.request(url2, timeout=3)
                        if not result2 == None: break                    
                    r = re.compile('href="([^"]+)"\s+class="action-btn').findall(result2)[0]
                    valid, hoster = source_utils.is_host_valid(r, hostDict)
                    if not valid: continue
                    urls, host, direct = source_utils.check_directstreams(r, hoster)
                    for x in urls: sources.append({'source': host, 'quality': x['quality'], 'language': 'en', 'url': x['url'], 'direct': direct, 'debridonly': False})
                    
                except:
                    #traceback.print_exc()
                    pass           
                    
            return sources
        except:
            return sources
Developer: vphuc81 | Project: MyRepository | Lines: 33 | Source file: seriesfree.py


Example 4: novedades

def novedades(item):
    logger.info("[serieonline.py] novedades")

    # Descarga la página
    data = scrapertools.cachePage(item.url)

    # Extrae las entradas
    patronvideos  = '<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" alt="([^"]+)" class="captify" /></a>'

    matches = re.compile(patronvideos,re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)

    itemlist = []
    for match in matches:
        scrapedtitle = match[1] + " " + match[3]
        scrapedplot = ""
        scrapedurl = urlparse.urljoin(item.url,match[0])
        scrapedthumbnail = urlparse.urljoin(item.url,match[2])
        if (DEBUG): logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]")

        # Añade al listado de XBMC
        itemlist.append( Item(channel=CHANNELNAME, action="findvideos", title=scrapedtitle , url=scrapedurl , thumbnail=scrapedthumbnail , plot=scrapedplot , folder=True) )

    # Extrae el paginador
    patronvideos  = '<div class="paginacion-num"><a href="([^"]+)">'
    matches = re.compile(patronvideos,re.DOTALL).findall(data)
    scrapertools.printMatches(matches)

    if len(matches)>0:
        scrapedtitle = "Página siguiente"
        scrapedurl = urlparse.urljoin(item.url,matches[0])
        itemlist.append( Item(channel=CHANNELNAME, action="novedades", title=scrapedtitle , url=scrapedurl , folder=True) )

    return itemlist
Developer: jorik041 | Project: pelisalacarta-personal-fork | Lines: 34 | Source file: serieonline.py


Example 5: getL10nRepositories

def getL10nRepositories(changesets, l10nRepoPath, relbranch=None):
    """Parses a list of locale names and revisions for their associated
       repository from the 'changesets' string passed in."""
    # urljoin() will strip the last part of l10nRepoPath if it doesn't end
    # with "/"
    if not l10nRepoPath.endswith('/'):
        l10nRepoPath = l10nRepoPath + '/'
    repositories = {}
    try:
        for locale, data in json.loads(changesets).iteritems():
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': data['revision'],
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }
    except (TypeError, ValueError):
        for locale, revision in parsePlainL10nChangesets(changesets).iteritems():
            if revision == 'FIXME':
                raise Exception('Found FIXME in changesets for locale "%s"' % locale)
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': revision,
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }

    return repositories
Developer: MihaiTabara | Project: build-tools | Lines: 28 | Source file: l10n.py
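
The trailing-slash behaviour that getL10nRepositories guards against in its comment can be seen directly with urljoin. A small illustrative sketch (the repository path below is a made-up placeholder, not one taken from the project above):

    import urlparse

    # Without a trailing slash, the last path component is treated as a
    # document and is replaced by the joined locale.
    print urlparse.urljoin('releases/l10n/mozilla-release', 'de')
    # -> releases/l10n/de

    # With the trailing slash that the function appends, the locale is
    # added below the repository path instead.
    print urlparse.urljoin('releases/l10n/mozilla-release/', 'de')
    # -> releases/l10n/mozilla-release/de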


Example 6: __init__

 def __init__(self, layer, mapfile, fonts=None):
     """ Initialize Mapnik provider with layer and mapfile.
         
         XML mapfile keyword arg comes from TileStache config,
         and is an absolute path by the time it gets here.
     """
     maphref = urljoin(layer.config.dirpath, mapfile)
     scheme, h, path, q, p, f = urlparse(maphref)
     
     if scheme in ('file', ''):
         self.mapfile = path
     else:
         self.mapfile = maphref
     
     self.layer = layer
     self.mapnik = None
     
     engine = mapnik.FontEngine.instance()
     
     if fonts:
         fontshref = urljoin(layer.config.dirpath, fonts)
         scheme, h, path, q, p, f = urlparse(fontshref)
         
         if scheme not in ('file', ''):
             raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)
     
         for font in glob(path.rstrip('/') + '/*.ttf'):
             engine.register_font(str(font))
Developer: Outdooractive | Project: TileStache | Lines: 28 | Source file: Mapnik.py


Example 7: processJob

def processJob(jobDetails):
    try:
        job = {}
        url = urljoin(rootUrl, jobDetails.a['href'])
        soup = thisInstitution.getSoup(url)
        subLinks = soup.select('.pinkbox_heading a')
        if subLinks:
            for link in subLinks:
                job['url'] = urljoin(rootUrl, link['href'])
                job['title'] = link.get_text()
                print job['title']
                job["language"] = 'de'                
                jobPage = thisInstitution.getSoup(job['url'])
                content = jobPage.find(id='contentblock')
                job['text'] = unicode(content)
                thisInstitution.addRecord(job) 
        else:
            job['url'] = url
            job['title'] = jobDetails.a.get_text()
            print job['title']
            job["language"] = 'de'                
            content = soup.find(id='contentblock')
            job['text'] = unicode(content)          
            thisInstitution.addRecord(job)  
    except Exception as e:
        print e
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
Developer: ayat-ra | Project: scraping | Lines: 29 | Source file: helmutSchmidtUniversity.py


Example 8: parse

 def parse(self, response):
     self._logger.info("start response in parse -> response type:%s"%type(response).__name__)
     item_urls = [
         urljoin(response.url, x) for x in list(set(
             response.xpath('//div[@id="resultsCol"]//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract()
         ))
     ]
     self.crawler.stats.inc_total_pages(response.meta['crawlid'], response.meta['spiderid'], response.meta['appid'], len(item_urls))
     for item_url in item_urls:
         yield Request(url=item_url,
                       callback=self.parse_item,
                       meta=response.meta)
     workers = response.meta.get('workers', {})
     for worker in workers.keys():
         workers[worker] = 0
     if "if_next_page" in response.meta: del response.meta["if_next_page"]
     next_page_urls = [
         urljoin(response.url, x) for x in list(set(
             response.xpath('//div[@id="pagn"]//span[@class="pagnRA"]/a/@href').extract()
         ))
     ]
     response.meta["if_next_page"] = True
     for next_page_url in next_page_urls:
         yield Request(url=next_page_url,
                       callback=self.parse,
                       meta=response.meta)
Developer: mtaziz | Project: jaycluster | Lines: 26 | Source file: amazon_spider.py


Example 9: __search

    def __search(self, titles, type, year, season=0, episode=False):
        try:
            years = [str(year), str(int(year) + 1), str(int(year) - 1)]
            years = ['&veroeffentlichung[]=%s' % i for i in years]

            query = self.search_link % (type, urllib.quote_plus(cleantitle.query(titles[0])))
            query += ''.join(years)
            query = urlparse.urljoin(self.base_link, query)

            t = [cleantitle.get(i) for i in set(titles) if i]

            r = self.__proceed_search(query)
            r = [i[0] for i in r if cleantitle.get(i[1]) in t and int(i[2]) == int(season)][0]

            url = source_utils.strip_domain(r)
            if episode:
                r = client.request(urlparse.urljoin(self.base_link, url))
                r = dom_parser.parse_dom(r, 'div', attrs={'class': 'season-list'})
                r = dom_parser.parse_dom(r, 'li')
                r = dom_parser.parse_dom(r, 'a', req='href')
                r = [i.attrs['href'] for i in r if i and int(i.content) == int(episode)][0]

                url = source_utils.strip_domain(r)
            return url
        except:
            return
Developer: azumimuo | Project: family-xbmc-addon | Lines: 26 | Source file: meinkino.py


Example 10: mainlist

def mainlist(item):
    logger.info()

    thumb_series    = get_thumb("squares", "thumb_canales_series.png")
    thumb_series_az = get_thumb("squares", "thumb_canales_series_az.png")
    thumb_buscar    = get_thumb("squares", "thumb_buscar.png")

    itemlist = []
    itemlist.append(Item(channel=item.channel, title="Listado alfabético", action="series_listado_alfabetico",
                         thumbnail=thumb_series_az))
    itemlist.append(Item(channel=item.channel, title="Todas las series", action="series",
                         url=urlparse.urljoin(HOST, "listado/"), thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Capítulos de estreno", action="homeSection", extra=CAPITULOS_DE_ESTRENO_STR,
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Último actualizado", action="homeSection", extra="Último Actualizado",
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series más vistas", action="homeSection", extra="Series Más vistas",
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series menos vistas", action="homeSection", extra="Series Menos vistas",
                         url=HOST , thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Últimas fichas creadas", action="series",
                         url=urlparse.urljoin(HOST, "fichas_creadas/"), thumbnail=thumb_series))

    itemlist.append(Item(channel=item.channel, title="Buscar...", action="search", url=HOST, thumbnail=thumb_buscar))

    if filtertools.context:
        itemlist = filtertools.show_option(itemlist, item.channel, list_idiomas, CALIDADES)

    return itemlist
Developer: neno1978 | Project: pelisalacarta | Lines: 29 | Source file: seriesblanco.py


Example 11: parseImgLinks

	def parseImgLinks(self,depth=1):
		url_response = None
		try:
			url_response = urllib2.urlopen(self.scrap_url,timeout=self._timeout)
		except Exception as e:
			print("   [ERROR]: Could not open {0}: {1}".format(self.scrap_url,e.reason))
			return self.img_list

		html_parse = BeautifulSoup(url_response)
		unique_images_found = 0
		total_images_found = 0
		self.visited[self.scrap_url] = 1

		for img in html_parse.findAll('img'):
			try:
				abs_url = urljoin(self.scrap_url,img['src']) if urlparse(img['src']).netloc == "" else img['src']
				if abs_url not in self.img_list:
					self.img_list.add(abs_url)
					unique_images_found += 1
				total_images_found += 1
			except:
				pass

		print("   [Found %d images / %d new]: %s" % (total_images_found,unique_images_found,self.scrap_url))
		if depth > 1:
			for a in html_parse.findAll('a'):
				try:
					if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
						self.scrap_url = urljoin(self.scrape_url_orig,a['href'])
						if self.scrap_url in self.visited: continue
						self.parseImgLinks(depth - 1)
				except:
					pass
		return self.img_list
Developer: erktheerk | Project: image-scraper | Lines: 34 | Source file: __init__.py


Example 12: episodios

def episodios(item):
    logger.info("{0} - {1}".format(item.title, item.url))

    itemlist = []

    # Descarga la página
    data = scrapertools.cache_page(item.url)

    fanart = scrapertools.find_single_match(data, "background-image[^'\"]+['\"]([^'\"]+)")
    plot = scrapertools.find_single_match(data, "id=['\"]profile2['\"]>\s*(.*?)\s*</div>")

    logger.debug("fanart: {0}".format(fanart))
    logger.debug("plot: {0}".format(plot))


    episodes = re.findall("<tr.*?href=['\"](?P<url>[^'\"]+).+?>(?P<title>.+?)</a>.*?<td>(?P<flags>.*?)</td>", data, re.MULTILINE | re.DOTALL)
    for url, title, flags in episodes:
        idiomas = " ".join(["[{0}]".format(IDIOMAS.get(language, "OVOS")) for language in re.findall("banderas/([^\.]+)", flags, re.MULTILINE)])
        displayTitle = "{show} - {title} {languages}".format(show = item.show, title = title, languages = idiomas)
        logger.debug("Episode found {0}: {1}".format(displayTitle, urlparse.urljoin(HOST, url)))
        itemlist.append(item.clone(title=displayTitle, url=urlparse.urljoin(HOST, url),
                                   action="findvideos", plot=plot, fanart=fanart, language=idiomas,
                                   list_idiomas=list_idiomas, list_calidad=CALIDADES, context=filtertools.context))

    if len(itemlist) > 0 and filtertools.context:
        itemlist = filtertools.get_links(itemlist, item.channel)

    if config.get_library_support() and len(itemlist) > 0:
        itemlist.append(item.clone(title="Añadir esta serie a la biblioteca", action="add_serie_to_library", extra="episodios"))

    return itemlist
Developer: neno1978 | Project: pelisalacarta | Lines: 31 | Source file: seriesblanco.py


Example 13: choose_reference

def choose_reference(experiment, biorep_n, server, keypair, sex_specific):

    replicates = [common.encoded_get(urlparse.urljoin(server,rep_uri), keypair, frame='embedded') for rep_uri in experiment['replicates']]
    replicate = next(rep for rep in replicates if rep.get('biological_replicate_number') == biorep_n)
    logging.debug('Replicate uuid %s' %(replicate.get('uuid')))
    organism_uri = replicate.get('library').get('biosample').get('organism')
    organism_obj = common.encoded_get(urlparse.urljoin(server,organism_uri), keypair)

    try:
        organism_name = organism_obj['name']
    except:
        logging.error('%s:rep%d Cannot determine organism.' %(experiment.get('accession'), biorep_n))
        raise
        return None
    else:
        logging.debug("Organism name %s" %(organism_name))

    if sex_specific:
        try:
            sex = replicate.get('library').get('biosample').get('sex')
            assert sex in ['male', 'female']
        except:
            logging.warning('%s:rep%d Sex is %s.  Mapping to male reference.' %(experiment.get('accession'), biorep_n, sex))
            sex = 'male'

        logging.debug('Organism %s sex %s' %(organism_name, sex))
    else:
        sex = 'male'
    
    genome_assembly = args.assembly

    reference = next((ref.get('file') for ref in REFERENCES if ref.get('organism') == organism_name and ref.get('sex') == sex and ref.get('assembly') == genome_assembly), None)
    logging.debug('Found reference %s' %(reference))
    return reference
Developer: anwesharry | Project: chip-seq-pipeline | Lines: 34 | Source file: map_only.py


Example 14: check_page

    def check_page(self, page):

        self.marionette.navigate(urlparse.urljoin(self.server_prefix, page))
        try:
            self.marionette.find_element("id", 'complete')
        except NoSuchElementException:
            fullPageUrl = urlparse.urljoin(self.relPath, page)

            details = "%s: 1 failure encountered\n%s" % \
                      (fullPageUrl,
                       self.get_failure_summary(
                           fullPageUrl, "Waiting for Completion",
                           "Could not find the test complete indicator"))

            raise AssertionError(details)

        fail_node = self.marionette.find_element("css selector",
                                                 '.failures > em')
        if fail_node.text == "0":
            return

        # This may want to be in a more general place triggerable by an env
        # var some day if it ends up being something we need often:
        #
        # If you have browser-based unit tests which work when loaded manually
        # but not from marionette, uncomment the two lines below to break
        # on failing tests, so that the browsers won't be torn down, and you
        # can use the browser debugging facilities to see what's going on.
        #from ipdb import set_trace
        #set_trace()

        raise AssertionError(self.get_failure_details(page))
Developer: AtulKumar2 | Project: gecko-dev | Lines: 32 | Source file: frontend_tester.py


Example 15: as_obi_serialization

    def as_obi_serialization(self, request=None):
        """Produce an Open Badge Infrastructure serialization of this badge"""
        if request:
            base_url = request.build_absolute_uri('/')
        else:
            base_url = 'http://%s' % (Site.objects.get_current().domain,)

        # see: https://github.com/brianlovesdata/openbadges/wiki/Assertions
        if not self.creator:
            issuer = SITE_ISSUER
        else:
            issuer = {
                # TODO: Get from user profile instead?
                "origin": urljoin(base_url, self.creator.get_absolute_url()),
                "name": self.creator.username,
                "contact": self.creator.email
            }

        data = {
            # The version of the spec/hub this manifest is compatible with. Use
            # "0.5.0" for the beta.
            "version": OBI_VERSION,
            # TODO: truncate more intelligently
            "name": self.title[:128],
            # TODO: truncate more intelligently
            "description": self.description[:128],
            "criteria": urljoin(base_url, self.get_absolute_url()),
            "issuer": issuer
        }
        if self.image:
            data['image'] = urljoin(base_url, self.image.url)
        return data
Developer: johnsca | Project: django-badger | Lines: 32 | Source file: models.py


Example 16: get_sources

    def get_sources(self, url, hosthdDict, hostDict, locDict):
        try:
            sources = []

            if url == None: return sources

            url = urlparse.urljoin(self.base_link, url)

            result = client.source(url)

            video_id = re.compile('video_id *= *[\'|\"](.+?)[\'|\"]').findall(result)[0]
            post = urllib.urlencode({'video_id': video_id})

            result = client.source(urlparse.urljoin(self.base_link, self.info_link), post=post)

            u = [i for i in result.split('&') if 'google' in i][0]
            u = urllib.unquote_plus(u)
            u = [urllib.unquote_plus(i.split('|')[-1]) for i in u.split(',')]
            u = [googleplus.tag(i)[0] for i in u]
            u = [i for i in u if i['quality'] in ['1080p', 'HD']]

            for i in u: sources.append({'source': 'GVideo', 'quality': i['quality'], 'provider': 'Afdah', 'url': i['url']})

            return sources
        except:
            return sources
Developer: marnnie | Project: Cable-buenaventura | Lines: 26 | Source file: afdah_mv.py


Example 17: get_sources

 def get_sources(self, video):
     source_url = self.get_url(video)
     hosters = []
     if source_url and source_url != FORCE_NO_MATCH:
         url = urlparse.urljoin(self.base_url, source_url)
         page_html = self._http_get(url, cache_limit=.5)
         movie_id = dom_parser.parse_dom(page_html, 'div', {'id': 'media-player'}, 'movie-id')
         if movie_id:
             server_url = SL_URL % (movie_id[0])
             url = urlparse.urljoin(self.base_url, server_url)
             html = self._http_get(url, cache_limit=.5)
             sources = {}
             for match in re.finditer('changeServer\(\s*(\d+)\s*,\s*(\d+)\s*\).*?class="btn-eps[^>]*>([^<]+)', html, re.DOTALL):
                 link_type, link_id, q_str = match.groups()
                 if link_type in ['12', '13', '14']:
                     url = urlparse.urljoin(self.base_url, PLAYLIST_URL1 % (link_id))
                     sources.update(self.__get_link_from_json(url, q_str))
                 else:
                     media_url = self.__get_ep_pl_url(link_type, page_html)
                     if media_url:
                         url = urlparse.urljoin(self.base_url, media_url)
                         xml = self._http_get(url, cache_limit=.5)
                         sources.update(self.__get_links_from_xml(xml, video))
             
         for source in sources:
             if sources[source]['direct']:
                 host = self._get_direct_hostname(source)
             else:
                 host = urlparse.urlparse(source).hostname
             hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': sources[source]['quality'], 'views': None, 'rating': None, 'url': source, 'direct': sources[source]['direct']}
             hosters.append(hoster)
     return hosters
Developer: lidormalicb | Project: salts | Lines: 32 | Source file: 123movies_scraper.py


Example 18: get_sources

    def get_sources(self, video):
        source_url = self.get_url(video)
        hosters = []
        if source_url and source_url != FORCE_NO_MATCH:
            page_url = urlparse.urljoin(self.base_url, source_url)
            html = self._http_get(page_url, cache_limit=.25)
            match = re.search('''<option[^>]+value\s*=\s*["']([^"']+)[^>]*>(?:Altyaz.{1,3}s.{1,3}z)<''', html)
            if match:
                option_url = urlparse.urljoin(self.base_url, match.group(1))
                html = self._http_get(option_url, cache_limit=.25)
                fragment = dom_parser.parse_dom(html, 'span', {'class': 'object-wrapper'})
                if fragment:
                    iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src')
                    if iframe_url:
                        html = self._http_get(iframe_url[0], cache_limit=.25)

                        seen_urls = {}
                        for match in re.finditer('"?file"?\s*:\s*"([^"]+)"\s*,\s*"?label"?\s*:\s*"(\d+)p?[^"]*"', html):
                            stream_url, height = match.groups()
                            if stream_url not in seen_urls:
                                seen_urls[stream_url] = True
                                stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())
                                host = self._get_direct_hostname(stream_url)
                                if host == 'gvideo':
                                    quality = scraper_utils.gv_get_quality(stream_url)
                                else:
                                    quality = scraper_utils.height_get_quality(height)
                                hoster = {'multi-part': False, 'host': self._get_direct_hostname(stream_url), 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                                hosters.append(hoster)
    
        return hosters
Developer: c0ns0le | Project: YCBuilds | Lines: 31 | Source file: dizibox_scraper.py


Example 19: search

def search(item,texto):
    logger.info("[pelisalacarta.seriesblanco search texto="+texto)

    itemlist = []

    item.url = urlparse.urljoin(host,"/search.php?q1=%s" % (texto))
    data = scrapertools.cache_page(item.url)
    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;|<Br>|<BR>|<br>|<br/>|<br />|-\s","",data)
    data = re.sub(r"<!--.*?-->","",data)

    #<div style='float:left;width: 620px;'><div style='float:left;width: 33%;text-align:center;'><a href='/serie/20/against-the-wall.html' '><img class='ict' src='http://4.bp.blogspot.com/-LBERI18Cq-g/UTendDO7iNI/AAAAAAAAPrk/QGqjmfdDreQ/s320/Against_the_Wall_Seriesdanko.jpg' alt='Capitulos de: Against The Wall' height='184' width='120'></a><br><div style='text-align:center;line-height:20px;height:20px;'><a href='/serie/20/against-the-wall.html' style='font-size: 11px;'> Against The Wall</a></div><br><br>

    patron = "<img class='ict' src='([^']+)'.*?<div style='text-align:center;line-height:20px;height:20px;'><a href='([^']+)' style='font-size: 11px;'>([^<]+)</a>"

    matches = re.compile(patron,re.DOTALL).findall(data)

    for scrapedthumbnail, scrapedurl, scrapedtitle in matches:
        
        
        itemlist.append( Item(channel=__channel__, title =scrapedtitle , url=urlparse.urljoin(host,scrapedurl), action="episodios", thumbnail=scrapedthumbnail, fanart ="http://portfolio.vernier.se/files/2014/03/light-grey-wood-photography-hd-wallpaper-1920x1200-46471.jpg", show=scrapedtitle) )

    try:
        return itemlist
    # Se captura la excepción, para no interrumpir al buscador global si un canal falla
    except:
        import sys
        for line in sys.exc_info():
            logger.error( "%s" % line )
        return []
Developer: superberny70 | Project: plugin.video.pelisalacarta-3-9X | Lines: 29 | Source file: seriesblanco.py


Example 20: findVideoFrameLink

def findVideoFrameLink(page, data):

    minheight = 300
    minwidth = 300

    frames = findFrames(data)
    if not frames:
        return None

    iframes = regexUtils.findall(
        data,
        "(frame(?![^>]*cbox\.ws)(?![^>]*Publi)(?![^>]*chat\d*\.\w+)(?![^>]*ad122m)(?![^>]*adshell)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*\sheight\s*=\s*[\"']*([\%\d]+)(?:px)?[\"']*[^>]*>)",
    )

    if iframes:
        for iframe in iframes:
            if iframe[1] == "100%":
                height = minheight + 1
            else:
                height = int(iframe[1])
            if height > minheight:
                m = regexUtils.findall(iframe[0], "[\"' ]width\s*=\s*[\"']*(\d+[%]*)(?:px)?[\"']*")
                if m:
                    if m[0] == "100%":
                        width = minwidth + 1
                    else:
                        width = int(m[0])
                    if width > minwidth:
                        m = regexUtils.findall(iframe[0], "['\"\s]src=[\"']*\s*([^>\"' ]+)\s*[>\"']*")
                        if m:
                            return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    # Alternative 1
    iframes = regexUtils.findall(
        data, '(frame(?![^>]*cbox\.ws)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*["; ]height:\s*(\d+)[^>]*>)'
    )
    if iframes:
        for iframe in iframes:
            height = int(iframe[1])
            if height > minheight:
                m = regexUtils.findall(iframe[0], '["; ]width:\s*(\d+)')
                if m:
                    width = int(m[0])
                    if width > minwidth:
                        m = regexUtils.findall(iframe[0], '["; ]src=["\']*\s*([^>"\' ]+)\s*[>"\']*')
                        if m:
                            return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    # Alternative 2 (Frameset)
    m = regexUtils.findall(data, '<FRAMESET[^>]+100%[^>]+>\s*<FRAME[^>]+src="([^"]+)"')
    if m:
        return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    m = regexUtils.findall(
        data, '<a href="([^"]+)" target="_blank"><img src="[^"]+" height="450" width="600" longdesc="[^"]+"/></a>'
    )
    if m:
        return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    return None
Developer: BirdsofaFeather | Project: husham.com | Lines: 60 | Source file: scrapingUtils.py



Note: the urlparse.urljoin examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their developers; copyright remains with the original authors, and redistribution or use should follow the license of the corresponding project. Please do not reproduce without permission.

