This article collects typical usage examples of Python's urlparse.urljoin function. If you have been wondering what urljoin does, how to call it, or what real-world uses look like, the hand-picked examples below should help.
The 20 urljoin code examples shown below are sorted by popularity by default. You can upvote the ones you like or find useful; your feedback helps the system recommend better Python code samples.
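Before the examples, here is a minimal sketch of what urljoin does, using made-up example URLs (the module path shown is the Python 2 one used throughout this article; in Python 3 the same function lives in urllib.parse):

from urlparse import urljoin  # Python 3: from urllib.parse import urljoin

base = 'http://www.example.com/shows/index.html'
print(urljoin(base, 'episode-1.html'))              # http://www.example.com/shows/episode-1.html (resolved against the base's directory)
print(urljoin(base, '/search.php?q=test'))          # http://www.example.com/search.php?q=test (root-relative path replaces the whole path)
print(urljoin(base, 'http://other.example.org/a'))  # http://other.example.org/a (absolute URLs pass through unchanged)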
Example 1: get_sub
def get_sub(self):
    """Fetches the subtitles from addic7ed from url specified in given database (db) for that episode"""
    url_split = urlparse.urlsplit(self.url)
    head, tail = url_split.path.rsplit('/', 1)
    new_path = head, 'addic7ed'
    referer = urlparse.urlunsplit(url_split._replace(path=urlparse.urljoin(*new_path)))
    domain = self.url
    response = urllib2.urlopen(domain)  # open the url
    html = response.read()  # load the html code
    soup = BeautifulSoup(html)  # parse the html code
    links = []
    for x in soup.find_all(class_="buttonDownload"):
        links.append(x.attrs['href'])
    domain = 'http://www.addic7ed.com/'
    urls = []
    for link in links:
        urls.append(urlparse.urljoin(domain, link))
    page = urls[0]
    req = urllib2.Request(page, headers={'User-Agent': 'Mozilla 5.10', 'Referer': referer})
    response = urllib2.urlopen(req)
    data = response.read()
    test = response.info()
    print test
    if response.info().has_key('Content-Disposition'):
        with open(os.path.join(self.db.env.subs_dir, '%s.srt' % self.title), 'wb') as f:
            f.write(data)
    else:
        return response.info()
Author: urucaro | Project: edu | Lines: 33 | Source: __init__.py
Example 2: handle_captcha
def handle_captcha(self, response, solver):
    sel = scrapy.Selector(response)
    iframe_src = sel.xpath(self.CAPTCHA_XPATH).extract()[0]
    iframe_url = urljoin(response.url, iframe_src)
    iframe_request = scrapy.Request(iframe_url)
    iframe_response = yield download(self.crawler, iframe_request)
    iframe_sel = scrapy.Selector(iframe_response)
    img_src, = iframe_sel.xpath('//img/@src').extract()[:1] or [None]
    if img_src is None:
        raise DecaptchaError('No //img/@src found on CAPTCHA page')
    img_url = urljoin(iframe_response.url, img_src)
    img_request = scrapy.Request(img_url)
    img_response = yield download(self.crawler, img_request)
    scrapy.log.msg('CAPTCHA image downloaded, solving')
    captcha_text = yield solver.solve(img_response.body)
    scrapy.log.msg('CAPTCHA solved: %s' % captcha_text)
    challenge_request = scrapy.FormRequest.from_response(
        iframe_response, formxpath='//form',
        formdata={'recaptcha_response_field': captcha_text}
    )
    challenge_response = yield download(self.crawler, challenge_request)
    challenge_sel = scrapy.Selector(challenge_response)
    challenge, = challenge_sel.xpath(
        '//textarea/text()'
    ).extract()[:1] or [None]
    if not challenge:
        raise DecaptchaError('Bad challenge from reCAPTCHA API:\n%s' %
                             challenge_response.body)
    scrapy.log.msg('CAPTCHA solved, submitting challenge')
    submit_request = scrapy.FormRequest.from_response(
        response, formxpath='//form[.%s]' % self.CAPTCHA_XPATH,
        formdata={'recaptcha_challenge_field': challenge}
    )
    yield download(self.crawler, submit_request)
Author: JonathanBowker | Project: decaptcha | Lines: 34 | Source: recaptcha.py
Example 3: sources
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url == None: return sources
        url = urlparse.urljoin(self.base_link, url)
        for i in range(3):
            result = client.request(url, timeout=10)
            if not result == None: break
        dom = dom_parser.parse_dom(result, 'div', attrs={'class': 'links', 'id': 'noSubs'})
        result = dom[0].content
        links = re.compile('<tr\s*>\s*<td><i\s+class="fa fa-youtube link-logo"></i>([^<]+).*?href="([^"]+)"\s+class="watch', re.DOTALL).findall(result)
        for link in links[:5]:
            try:
                url2 = urlparse.urljoin(self.base_link, link[1])
                for i in range(2):
                    result2 = client.request(url2, timeout=3)
                    if not result2 == None: break
                r = re.compile('href="([^"]+)"\s+class="action-btn').findall(result2)[0]
                valid, hoster = source_utils.is_host_valid(r, hostDict)
                if not valid: continue
                urls, host, direct = source_utils.check_directstreams(r, hoster)
                for x in urls: sources.append({'source': host, 'quality': x['quality'], 'language': 'en', 'url': x['url'], 'direct': direct, 'debridonly': False})
            except:
                #traceback.print_exc()
                pass
        return sources
    except:
        return sources
Author: vphuc81 | Project: MyRepository | Lines: 33 | Source: seriesfree.py
Example 4: novedades
def novedades(item):
    logger.info("[serieonline.py] novedades")

    # Download the page
    data = scrapertools.cachePage(item.url)

    # Extract the entries
    patronvideos = '<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" alt="([^"]+)" class="captify" /></a>'
    matches = re.compile(patronvideos, re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)

    itemlist = []
    for match in matches:
        scrapedtitle = match[1] + " " + match[3]
        scrapedplot = ""
        scrapedurl = urlparse.urljoin(item.url, match[0])
        scrapedthumbnail = urlparse.urljoin(item.url, match[2])
        if (DEBUG): logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], thumbnail=[" + scrapedthumbnail + "]")

        # Add to the XBMC listing
        itemlist.append(Item(channel=CHANNELNAME, action="findvideos", title=scrapedtitle, url=scrapedurl, thumbnail=scrapedthumbnail, plot=scrapedplot, folder=True))

    # Extract the pager
    patronvideos = '<div class="paginacion-num"><a href="([^"]+)">'
    matches = re.compile(patronvideos, re.DOTALL).findall(data)
    scrapertools.printMatches(matches)

    if len(matches) > 0:
        scrapedtitle = "Página siguiente"
        scrapedurl = urlparse.urljoin(item.url, matches[0])
        itemlist.append(Item(channel=CHANNELNAME, action="novedades", title=scrapedtitle, url=scrapedurl, folder=True))

    return itemlist
Author: jorik041 | Project: pelisalacarta-personal-fork | Lines: 34 | Source: serieonline.py
Example 5: getL10nRepositories
def getL10nRepositories(changesets, l10nRepoPath, relbranch=None):
    """Parses a list of locale names and revisions for their associated
    repository from the 'changesets' string passed in."""
    # urljoin() will strip the last part of l10nRepoPath if it doesn't end
    # with "/"
    if not l10nRepoPath.endswith('/'):
        l10nRepoPath = l10nRepoPath + '/'
    repositories = {}
    try:
        for locale, data in json.loads(changesets).iteritems():
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': data['revision'],
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }
    except (TypeError, ValueError):
        for locale, revision in parsePlainL10nChangesets(changesets).iteritems():
            if revision == 'FIXME':
                raise Exception('Found FIXME in changesets for locale "%s"' % locale)
            locale = urljoin(l10nRepoPath, locale)
            repositories[locale] = {
                'revision': revision,
                'relbranchOverride': relbranch,
                'bumpFiles': []
            }
    return repositories
Author: MihaiTabara | Project: build-tools | Lines: 28 | Source: l10n.py
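The comment in Example 5 points at a urljoin behaviour that is easy to trip over: without a trailing slash, the last path segment of the base is treated as a document and gets replaced. A small sketch with a made-up repository URL (not taken from the build-tools project):

from urlparse import urljoin  # Python 3: from urllib.parse import urljoin

# No trailing slash: 'l10n-central' is treated as a file name and dropped.
print(urljoin('http://hg.example.org/l10n-central', 'de'))   # http://hg.example.org/de
# Trailing slash: the locale is appended under the repository path, as intended.
print(urljoin('http://hg.example.org/l10n-central/', 'de'))  # http://hg.example.org/l10n-central/de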
Example 6: __init__
def __init__(self, layer, mapfile, fonts=None):
    """ Initialize Mapnik provider with layer and mapfile.

        XML mapfile keyword arg comes from TileStache config,
        and is an absolute path by the time it gets here.
    """
    maphref = urljoin(layer.config.dirpath, mapfile)
    scheme, h, path, q, p, f = urlparse(maphref)

    if scheme in ('file', ''):
        self.mapfile = path
    else:
        self.mapfile = maphref

    self.layer = layer
    self.mapnik = None

    engine = mapnik.FontEngine.instance()

    if fonts:
        fontshref = urljoin(layer.config.dirpath, fonts)
        scheme, h, path, q, p, f = urlparse(fontshref)

        if scheme not in ('file', ''):
            raise Exception('Fonts from "%s" can\'t be used by Mapnik' % fontshref)

        for font in glob(path.rstrip('/') + '/*.ttf'):
            engine.register_font(str(font))
Author: Outdooractive | Project: TileStache | Lines: 28 | Source: Mapnik.py
Example 7: processJob
def processJob(jobDetails):
    try:
        job = {}
        url = urljoin(rootUrl, jobDetails.a['href'])
        soup = thisInstitution.getSoup(url)
        subLinks = soup.select('.pinkbox_heading a')
        if subLinks:
            for link in subLinks:
                job['url'] = urljoin(rootUrl, link['href'])
                job['title'] = link.get_text()
                print job['title']
                job["language"] = 'de'
                jobPage = thisInstitution.getSoup(job['url'])
                content = jobPage.find(id='contentblock')
                job['text'] = unicode(content)
                thisInstitution.addRecord(job)
        else:
            job['url'] = url
            job['title'] = jobDetails.a.get_text()
            print job['title']
            job["language"] = 'de'
            content = soup.find(id='contentblock')
            job['text'] = unicode(content)
            thisInstitution.addRecord(job)
    except Exception as e:
        print e
        # record the error with the shared code and continue on to the next url
        thisInstitution.error(e.message, job)
        return False
Author: ayat-ra | Project: scraping | Lines: 29 | Source: helmutSchmidtUniversity.py
Example 8: parse
def parse(self, response):
    self._logger.info("start response in parse -> response type:%s" % type(response).__name__)
    item_urls = [
        urljoin(response.url, x) for x in list(set(
            response.xpath('//div[@id="resultsCol"]//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract()
        ))
    ]
    self.crawler.stats.inc_total_pages(response.meta['crawlid'], response.meta['spiderid'], response.meta['appid'], len(item_urls))
    for item_url in item_urls:
        yield Request(url=item_url,
                      callback=self.parse_item,
                      meta=response.meta)
    workers = response.meta.get('workers', {})
    for worker in workers.keys():
        workers[worker] = 0
    if "if_next_page" in response.meta: del response.meta["if_next_page"]
    next_page_urls = [
        urljoin(response.url, x) for x in list(set(
            response.xpath('//div[@id="pagn"]//span[@class="pagnRA"]/a/@href').extract()
        ))
    ]
    response.meta["if_next_page"] = True
    for next_page_url in next_page_urls:
        yield Request(url=next_page_url,
                      callback=self.parse,
                      meta=response.meta)
Author: mtaziz | Project: jaycluster | Lines: 26 | Source: amazon_spider.py
Example 9: __search
def __search(self, titles, type, year, season=0, episode=False):
    try:
        years = [str(year), str(int(year) + 1), str(int(year) - 1)]
        years = ['&veroeffentlichung[]=%s' % i for i in years]

        query = self.search_link % (type, urllib.quote_plus(cleantitle.query(titles[0])))
        query += ''.join(years)
        query = urlparse.urljoin(self.base_link, query)

        t = [cleantitle.get(i) for i in set(titles) if i]

        r = self.__proceed_search(query)
        r = [i[0] for i in r if cleantitle.get(i[1]) in t and int(i[2]) == int(season)][0]

        url = source_utils.strip_domain(r)

        if episode:
            r = client.request(urlparse.urljoin(self.base_link, url))
            r = dom_parser.parse_dom(r, 'div', attrs={'class': 'season-list'})
            r = dom_parser.parse_dom(r, 'li')
            r = dom_parser.parse_dom(r, 'a', req='href')
            r = [i.attrs['href'] for i in r if i and int(i.content) == int(episode)][0]
            url = source_utils.strip_domain(r)

        return url
    except:
        return
Author: azumimuo | Project: family-xbmc-addon | Lines: 26 | Source: meinkino.py
Example 10: mainlist
def mainlist(item):
    logger.info()

    thumb_series = get_thumb("squares", "thumb_canales_series.png")
    thumb_series_az = get_thumb("squares", "thumb_canales_series_az.png")
    thumb_buscar = get_thumb("squares", "thumb_buscar.png")

    itemlist = []
    itemlist.append(Item(channel=item.channel, title="Listado alfabético", action="series_listado_alfabetico",
                         thumbnail=thumb_series_az))
    itemlist.append(Item(channel=item.channel, title="Todas las series", action="series",
                         url=urlparse.urljoin(HOST, "listado/"), thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Capítulos de estreno", action="homeSection", extra=CAPITULOS_DE_ESTRENO_STR,
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Último actualizado", action="homeSection", extra="Último Actualizado",
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series más vistas", action="homeSection", extra="Series Más vistas",
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Series menos vistas", action="homeSection", extra="Series Menos vistas",
                         url=HOST, thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Últimas fichas creadas", action="series",
                         url=urlparse.urljoin(HOST, "fichas_creadas/"), thumbnail=thumb_series))
    itemlist.append(Item(channel=item.channel, title="Buscar...", action="search", url=HOST, thumbnail=thumb_buscar))

    if filtertools.context:
        itemlist = filtertools.show_option(itemlist, item.channel, list_idiomas, CALIDADES)

    return itemlist
Author: neno1978 | Project: pelisalacarta | Lines: 29 | Source: seriesblanco.py
Example 11: parseImgLinks
def parseImgLinks(self, depth=1):
    url_response = None
    try:
        url_response = urllib2.urlopen(self.scrap_url, timeout=self._timeout)
    except Exception as e:
        print(" [ERROR]: Could not open {0}: {1}".format(self.scrap_url, e.reason))
        return self.img_list

    html_parse = BeautifulSoup(url_response)
    unique_images_found = 0
    total_images_found = 0
    self.visited[self.scrap_url] = 1

    for img in html_parse.findAll('img'):
        try:
            abs_url = urljoin(self.scrap_url, img['src']) if urlparse(img['src']).netloc == "" else img['src']
            if abs_url not in self.img_list:
                self.img_list.add(abs_url)
                unique_images_found += 1
            total_images_found += 1
        except:
            pass

    print(" [Found %d images / %d new]: %s" % (total_images_found, unique_images_found, self.scrap_url))

    if depth > 1:
        for a in html_parse.findAll('a'):
            try:
                if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
                    self.scrap_url = urljoin(self.scrape_url_orig, a['href'])
                    if self.scrap_url in self.visited: continue
                    self.parseImgLinks(depth - 1)
            except:
                pass

    return self.img_list
Author: erktheerk | Project: image-scraper | Lines: 34 | Source: __init__.py
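A side note on Example 11: the explicit netloc check before calling urljoin is mostly defensive, since urljoin already returns an absolute second argument unchanged. A quick sketch with made-up URLs:

from urlparse import urljoin, urlparse  # Python 3: from urllib.parse import urljoin, urlparse

page = 'http://gallery.example.com/album/index.html'
print(urljoin(page, 'img/cat.jpg'))                     # http://gallery.example.com/album/img/cat.jpg
print(urljoin(page, 'http://cdn.example.net/dog.jpg'))  # http://cdn.example.net/dog.jpg (already absolute, passed through)
print(urlparse('img/cat.jpg').netloc == '')             # True, which is what the check in Example 11 tests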
Example 12: episodios
def episodios(item):
    logger.info("{0} - {1}".format(item.title, item.url))

    itemlist = []

    # Download the page
    data = scrapertools.cache_page(item.url)

    fanart = scrapertools.find_single_match(data, "background-image[^'\"]+['\"]([^'\"]+)")
    plot = scrapertools.find_single_match(data, "id=['\"]profile2['\"]>\s*(.*?)\s*</div>")

    logger.debug("fanart: {0}".format(fanart))
    logger.debug("plot: {0}".format(plot))

    episodes = re.findall("<tr.*?href=['\"](?P<url>[^'\"]+).+?>(?P<title>.+?)</a>.*?<td>(?P<flags>.*?)</td>", data, re.MULTILINE | re.DOTALL)

    for url, title, flags in episodes:
        idiomas = " ".join(["[{0}]".format(IDIOMAS.get(language, "OVOS")) for language in re.findall("banderas/([^\.]+)", flags, re.MULTILINE)])
        displayTitle = "{show} - {title} {languages}".format(show=item.show, title=title, languages=idiomas)
        logger.debug("Episode found {0}: {1}".format(displayTitle, urlparse.urljoin(HOST, url)))
        itemlist.append(item.clone(title=displayTitle, url=urlparse.urljoin(HOST, url),
                                   action="findvideos", plot=plot, fanart=fanart, language=idiomas,
                                   list_idiomas=list_idiomas, list_calidad=CALIDADES, context=filtertools.context))

    if len(itemlist) > 0 and filtertools.context:
        itemlist = filtertools.get_links(itemlist, item.channel)

    if config.get_library_support() and len(itemlist) > 0:
        itemlist.append(item.clone(title="Añadir esta serie a la biblioteca", action="add_serie_to_library", extra="episodios"))

    return itemlist
Author: neno1978 | Project: pelisalacarta | Lines: 31 | Source: seriesblanco.py
Example 13: choose_reference
def choose_reference(experiment, biorep_n, server, keypair, sex_specific):
    replicates = [common.encoded_get(urlparse.urljoin(server, rep_uri), keypair, frame='embedded') for rep_uri in experiment['replicates']]
    replicate = next(rep for rep in replicates if rep.get('biological_replicate_number') == biorep_n)
    logging.debug('Replicate uuid %s' % (replicate.get('uuid')))
    organism_uri = replicate.get('library').get('biosample').get('organism')
    organism_obj = common.encoded_get(urlparse.urljoin(server, organism_uri), keypair)
    try:
        organism_name = organism_obj['name']
    except:
        logging.error('%s:rep%d Cannot determine organism.' % (experiment.get('accession'), biorep_n))
        raise
        return None
    else:
        logging.debug("Organism name %s" % (organism_name))

    if sex_specific:
        try:
            sex = replicate.get('library').get('biosample').get('sex')
            assert sex in ['male', 'female']
        except:
            logging.warning('%s:rep%d Sex is %s. Mapping to male reference.' % (experiment.get('accession'), biorep_n, sex))
            sex = 'male'
        logging.debug('Organism %s sex %s' % (organism_name, sex))
    else:
        sex = 'male'

    genome_assembly = args.assembly
    reference = next((ref.get('file') for ref in REFERENCES if ref.get('organism') == organism_name and ref.get('sex') == sex and ref.get('assembly') == genome_assembly), None)
    logging.debug('Found reference %s' % (reference))
    return reference
Author: anwesharry | Project: chip-seq-pipeline | Lines: 34 | Source: map_only.py
Example 14: check_page
def check_page(self, page):
    self.marionette.navigate(urlparse.urljoin(self.server_prefix, page))
    try:
        self.marionette.find_element("id", 'complete')
    except NoSuchElementException:
        fullPageUrl = urlparse.urljoin(self.relPath, page)

        details = "%s: 1 failure encountered\n%s" % \
            (fullPageUrl,
             self.get_failure_summary(
                 fullPageUrl, "Waiting for Completion",
                 "Could not find the test complete indicator"))

        raise AssertionError(details)

    fail_node = self.marionette.find_element("css selector",
                                             '.failures > em')
    if fail_node.text == "0":
        return

    # This may want to be in a more general place triggerable by an env
    # var some day if it ends up being something we need often:
    #
    # If you have browser-based unit tests which work when loaded manually
    # but not from marionette, uncomment the two lines below to break
    # on failing tests, so that the browsers won't be torn down, and you
    # can use the browser debugging facilities to see what's going on.
    #from ipdb import set_trace
    #set_trace()

    raise AssertionError(self.get_failure_details(page))
Author: AtulKumar2 | Project: gecko-dev | Lines: 32 | Source: frontend_tester.py
Example 15: as_obi_serialization
def as_obi_serialization(self, request=None):
    """Produce an Open Badge Infrastructure serialization of this badge"""
    if request:
        base_url = request.build_absolute_uri('/')
    else:
        base_url = 'http://%s' % (Site.objects.get_current().domain,)

    # see: https://github.com/brianlovesdata/openbadges/wiki/Assertions
    if not self.creator:
        issuer = SITE_ISSUER
    else:
        issuer = {
            # TODO: Get from user profile instead?
            "origin": urljoin(base_url, self.creator.get_absolute_url()),
            "name": self.creator.username,
            "contact": self.creator.email
        }

    data = {
        # The version of the spec/hub this manifest is compatible with. Use
        # "0.5.0" for the beta.
        "version": OBI_VERSION,
        # TODO: truncate more intelligently
        "name": self.title[:128],
        # TODO: truncate more intelligently
        "description": self.description[:128],
        "criteria": urljoin(base_url, self.get_absolute_url()),
        "issuer": issuer
    }

    if self.image:
        data['image'] = urljoin(base_url, self.image.url)

    return data
Author: johnsca | Project: django-badger | Lines: 32 | Source: models.py
Example 16: get_sources
def get_sources(self, url, hosthdDict, hostDict, locDict):
    try:
        sources = []

        if url == None: return sources

        url = urlparse.urljoin(self.base_link, url)

        result = client.source(url)
        video_id = re.compile('video_id *= *[\'|\"](.+?)[\'|\"]').findall(result)[0]
        post = urllib.urlencode({'video_id': video_id})

        result = client.source(urlparse.urljoin(self.base_link, self.info_link), post=post)

        u = [i for i in result.split('&') if 'google' in i][0]
        u = urllib.unquote_plus(u)
        u = [urllib.unquote_plus(i.split('|')[-1]) for i in u.split(',')]
        u = [googleplus.tag(i)[0] for i in u]
        u = [i for i in u if i['quality'] in ['1080p', 'HD']]

        for i in u: sources.append({'source': 'GVideo', 'quality': i['quality'], 'provider': 'Afdah', 'url': i['url']})

        return sources
    except:
        return sources
Author: marnnie | Project: Cable-buenaventura | Lines: 26 | Source: afdah_mv.py
Example 17: get_sources
def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        page_html = self._http_get(url, cache_limit=.5)
        movie_id = dom_parser.parse_dom(page_html, 'div', {'id': 'media-player'}, 'movie-id')
        if movie_id:
            server_url = SL_URL % (movie_id[0])
            url = urlparse.urljoin(self.base_url, server_url)
            html = self._http_get(url, cache_limit=.5)
            sources = {}
            for match in re.finditer('changeServer\(\s*(\d+)\s*,\s*(\d+)\s*\).*?class="btn-eps[^>]*>([^<]+)', html, re.DOTALL):
                link_type, link_id, q_str = match.groups()
                if link_type in ['12', '13', '14']:
                    url = urlparse.urljoin(self.base_url, PLAYLIST_URL1 % (link_id))
                    sources.update(self.__get_link_from_json(url, q_str))
                else:
                    media_url = self.__get_ep_pl_url(link_type, page_html)
                    if media_url:
                        url = urlparse.urljoin(self.base_url, media_url)
                        xml = self._http_get(url, cache_limit=.5)
                        sources.update(self.__get_links_from_xml(xml, video))

            for source in sources:
                if sources[source]['direct']:
                    host = self._get_direct_hostname(source)
                else:
                    host = urlparse.urlparse(source).hostname
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': sources[source]['quality'], 'views': None, 'rating': None, 'url': source, 'direct': sources[source]['direct']}
                hosters.append(hoster)

    return hosters
Author: lidormalicb | Project: salts | Lines: 32 | Source: 123movies_scraper.py
Example 18: get_sources
def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        match = re.search('''<option[^>]+value\s*=\s*["']([^"']+)[^>]*>(?:Altyaz.{1,3}s.{1,3}z)<''', html)
        if match:
            option_url = urlparse.urljoin(self.base_url, match.group(1))
            html = self._http_get(option_url, cache_limit=.25)
            fragment = dom_parser.parse_dom(html, 'span', {'class': 'object-wrapper'})
            if fragment:
                iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src')
                if iframe_url:
                    html = self._http_get(iframe_url[0], cache_limit=.25)

                    seen_urls = {}
                    for match in re.finditer('"?file"?\s*:\s*"([^"]+)"\s*,\s*"?label"?\s*:\s*"(\d+)p?[^"]*"', html):
                        stream_url, height = match.groups()
                        if stream_url not in seen_urls:
                            seen_urls[stream_url] = True
                            stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())
                            host = self._get_direct_hostname(stream_url)
                            if host == 'gvideo':
                                quality = scraper_utils.gv_get_quality(stream_url)
                            else:
                                quality = scraper_utils.height_get_quality(height)
                            hoster = {'multi-part': False, 'host': self._get_direct_hostname(stream_url), 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                            hosters.append(hoster)

    return hosters
Author: c0ns0le | Project: YCBuilds | Lines: 31 | Source: dizibox_scraper.py
Example 19: search
def search(item, texto):
    logger.info("[pelisalacarta.seriesblanco search texto=" + texto)

    itemlist = []

    item.url = urlparse.urljoin(host, "/search.php?q1=%s" % (texto))

    data = scrapertools.cache_page(item.url)
    data = re.sub(r"\n|\r|\t|\s{2}| |<Br>|<BR>|<br>|<br/>|<br />|-\s", "", data)
    data = re.sub(r"<!--.*?-->", "", data)

    #<div style='float:left;width: 620px;'><div style='float:left;width: 33%;text-align:center;'><a href='/serie/20/against-the-wall.html' '><img class='ict' src='http://4.bp.blogspot.com/-LBERI18Cq-g/UTendDO7iNI/AAAAAAAAPrk/QGqjmfdDreQ/s320/Against_the_Wall_Seriesdanko.jpg' alt='Capitulos de: Against The Wall' height='184' width='120'></a><br><div style='text-align:center;line-height:20px;height:20px;'><a href='/serie/20/against-the-wall.html' style='font-size: 11px;'> Against The Wall</a></div><br><br>
    patron = "<img class='ict' src='([^']+)'.*?<div style='text-align:center;line-height:20px;height:20px;'><a href='([^']+)' style='font-size: 11px;'>([^<]+)</a>"
    matches = re.compile(patron, re.DOTALL).findall(data)

    for scrapedthumbnail, scrapedurl, scrapedtitle in matches:
        itemlist.append(Item(channel=__channel__, title=scrapedtitle, url=urlparse.urljoin(host, scrapedurl), action="episodios", thumbnail=scrapedthumbnail, fanart="http://portfolio.vernier.se/files/2014/03/light-grey-wood-photography-hd-wallpaper-1920x1200-46471.jpg", show=scrapedtitle))

    try:
        return itemlist
    # Catch the exception so a failing channel does not break the global search
    except:
        import sys
        for line in sys.exc_info():
            logger.error("%s" % line)
        return []
Author: superberny70 | Project: plugin.video.pelisalacarta-3-9X | Lines: 29 | Source: seriesblanco.py
Example 20: findVideoFrameLink
def findVideoFrameLink(page, data):
    minheight = 300
    minwidth = 300

    frames = findFrames(data)
    if not frames:
        return None

    iframes = regexUtils.findall(
        data,
        "(frame(?![^>]*cbox\.ws)(?![^>]*Publi)(?![^>]*chat\d*\.\w+)(?![^>]*ad122m)(?![^>]*adshell)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*\sheight\s*=\s*[\"']*([\%\d]+)(?:px)?[\"']*[^>]*>)",
    )

    if iframes:
        for iframe in iframes:
            if iframe[1] == "100%":
                height = minheight + 1
            else:
                height = int(iframe[1])
            if height > minheight:
                m = regexUtils.findall(iframe[0], "[\"' ]width\s*=\s*[\"']*(\d+[%]*)(?:px)?[\"']*")
                if m:
                    if m[0] == "100%":
                        width = minwidth + 1
                    else:
                        width = int(m[0])
                    if width > minwidth:
                        m = regexUtils.findall(iframe[0], "['\"\s]src=[\"']*\s*([^>\"' ]+)\s*[>\"']*")
                        if m:
                            return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    # Alternative 1
    iframes = regexUtils.findall(
        data, '(frame(?![^>]*cbox\.ws)(?![^>]*capacanal)(?![^>]*blacktvlive\.com)[^>]*["; ]height:\s*(\d+)[^>]*>)'
    )
    if iframes:
        for iframe in iframes:
            height = int(iframe[1])
            if height > minheight:
                m = regexUtils.findall(iframe[0], '["; ]width:\s*(\d+)')
                if m:
                    width = int(m[0])
                    if width > minwidth:
                        m = regexUtils.findall(iframe[0], '["; ]src=["\']*\s*([^>"\' ]+)\s*[>"\']*')
                        if m:
                            return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    # Alternative 2 (Frameset)
    m = regexUtils.findall(data, '<FRAMESET[^>]+100%[^>]+>\s*<FRAME[^>]+src="([^"]+)"')
    if m:
        return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    m = regexUtils.findall(
        data, '<a href="([^"]+)" target="_blank"><img src="[^"]+" height="450" width="600" longdesc="[^"]+"/></a>'
    )
    if m:
        return urlparse.urljoin(urllib.unquote(page), m[0]).strip()

    return None
Author: BirdsofaFeather | Project: husham.com | Lines: 60 | Source: scrapingUtils.py
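One last behaviour worth knowing when resolving scraped src/href values, as Examples 11 and 20 do: protocol-relative references (those starting with //) inherit the scheme of the base page. A short sketch with made-up URLs:

from urlparse import urljoin  # Python 3: from urllib.parse import urljoin

page = 'https://player.example.com/watch?id=42'
print(urljoin(page, '//cdn.example.net/embed/42'))  # https://cdn.example.net/embed/42 (scheme inherited from the base)
print(urljoin(page, '/embed/42'))                   # https://player.example.com/embed/42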
Note: the urlparse.urljoin examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors. For redistribution and use, please refer to each project's license, and do not republish without permission.