This article collects typical usage examples of the Python function w3lib.url.urljoin_rfc. If you have been wondering how exactly to call urljoin_rfc, or looking for real-world examples of it in use, the curated code samples below may help.
The following shows 20 code examples of the urljoin_rfc function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
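Before the collected examples, here is a minimal sketch of what urljoin_rfc does: it resolves a relative URL against a base URL, much like the standard-library urljoin, returning an encoded URL string (newer w3lib releases deprecate urljoin_rfc in favour of urllib's urljoin). The URLs are illustrative values; the expected results mirror the assertions in Example 8 below.

from w3lib.url import urljoin_rfc

# Resolve a relative path against a base URL; the result is an encoded URL string.
urljoin_rfc('http://example.com/some/path', 'newpath/test')
# -> 'http://example.com/some/newpath/test'
urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other')
# -> 'http://example.com/some/key/other'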
Example 1: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    for href in response.xpath('//table/tr/td/strong/a/@href').extract():
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        #self.log("Parse %s %s" % (abs_url, response.url), level=scrapy.log.INFO)
        #yield scrapy.Request(url=abs_url, callback=self.parse)
    # parse PDF links
    for href in response.xpath('//table[@class="object_table"]/tr/td[4]/a/@href').extract():
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        #self.log("Parse %s %s" % (abs_url, response.url), level=scrapy.log.INFO)
        #yield scrapy.Request(url=abs_url, callback=self.parse)
    # parse pagination links
    for href in response.xpath('//table/tr/td/table/tr/td/a/@href').extract():
        if ("page=" not in href and "browse-date?top=" not in href) or "itemsPerPage=" in href:
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        #self.log("Parse %s %s" % (abs_url, response.url), level=scrapy.log.INFO)
        yield scrapy.Request(url=abs_url, callback=self.parse)
Developer: muzichenglong, Project: scrapyc, Lines: 30, Source: handle.py
Example 2: parse_index
def parse_index(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        yield scrapy.Request(url=response.url, callback=self.parse_index)
        return
    base_url = get_base_url(response)
    # parse the journal index page
    count = 0
    for href in response.xpath("//div[@id='divperilist']/ul/li/a/@href").extract():
        if href.startswith("Rss.ashx?"):
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        #self.log("Parse %s %s" % (response.url, abs_url), level=scrapy.log.INFO)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        yield scrapy.Request(url=abs_url, callback=self.parse_content)
        count += 1
    self.log("Fuck %s %d" % (response.url, count), level=scrapy.log.INFO)
    # parse index-page pagination
    for href in response.xpath("//div[@id='divperilist']/table//a/@href").extract():
        if "PageNo" not in href:
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        self.log("Parse %s %s" % (response.url, abs_url), level=scrapy.log.INFO)
        yield scrapy.Request(url=abs_url, callback=self.parse_index)
Developer: muzichenglong, Project: scrapyc, Lines: 27, Source: wanfangdata.py
Example 3: extract_links
def extract_links(self, response):
    xs = HtmlXPathSelector(response)
    base_url = xs.select('//base/@href').extract()
    base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

    links = []
    for location in self.locations:
        if isinstance(location, basestring):
            selectors = xs.select(location)
        elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
            selectors = [location] if isinstance(location, HtmlXPathSelector) else location
        else:
            continue

        for selector in selectors:
            links.extend(self.extract_from_selector(selector, response.encoding))

    seen, ret = set(), []
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            if link.url in seen:
                continue
            else:
                seen.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        ret.append(link)

    return ret
Developer: bihicheng, Project: scrapy, Lines: 30, Source: image.py
Example 4: process_response
def process_response(self, request, response, spider):
    if "dont_redirect" in request.meta:
        return response

    if request.method.upper() == "HEAD":
        if response.status in [301, 302, 303, 307] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)
        else:
            return response

    if response.status in [302, 303] and "Location" in response.headers:
        redirected_url = urljoin_rfc(request.url, response.headers["location"])
        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    if response.status in [301, 307] and "Location" in response.headers:
        redirected_url = urljoin_rfc(request.url, response.headers["location"])
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    if isinstance(response, HtmlResponse):
        interval, url = get_meta_refresh(response)
        if url and interval < self.max_metarefresh_delay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, "meta refresh")

    return response
Developer: saidimu, Project: scrapy, Lines: 28, Source: redirect.py
Example 5: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

    return [Link(url, text) for url, text in urlstext]
Developer: bihicheng, Project: scrapy, Lines: 10, Source: regex.py
Example 6: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    count = 0
    for a in response.xpath('//a'):
        text = a.xpath("string(.)").extract()
        text = "".join(text).strip()
        if len(text) > 5 or "PDF" not in text:
            continue
        href = a.xpath("@href").extract()
        if len(href) != 1:
            continue
        href = href[0]
        if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
            onclick = a.xpath("@onclick").extract()[0]
            onclick = onclick.split(",")
            if len(onclick) < 2:
                continue
            if onclick[0].startswith("showArticleFile"):
                id = onclick[-1].split(")", 1)[0].replace("'", "")
            else:
                id = onclick[1].split(")", 1)[0].replace("'", "")
            if "/CN/" in response.url:
                pdf = response.url.split("/CN/", 1)[0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            elif "/EN/" in response.url:
                pdf = response.url.split("/EN/", 1)[0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
            else:
                continue
        elif "attachType=PDF&id=" in href:
            abs_url = urljoin_rfc(response.url, href)
            pdf = abs_url
        else:
            continue
        # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
        # print pdf
        self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
        yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
        count += 1

    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})

    self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
Developer: wjianwei126, Project: scrapyc, Lines: 53, Source: pdf.py
Example 7: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    self.base_url, self.links = etree.HTML(response_text, self.parser)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors="replace")
        ret.append(link)
    return ret
Developer: netconstructor, Project: scrapy, Lines: 14, Source: lxmlparser.py
Example 8: test_urljoin_rfc
def test_urljoin_rfc(self):
    self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
                     'http://example.com/some/newpath/test')
    self.assertEqual(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
                     'http://example.com/some/key/other')
    u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
    self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
    assert isinstance(u, str)
    u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', 'lala/\xa3', encoding='latin-1')
    self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
    assert isinstance(u, str)
    u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
    self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
    assert isinstance(u, str)
Developer: LucianU, Project: w3lib, Lines: 14, Source: test_url.py
Example 9: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    site = get_url_site(response.url)
    if site in self.parses:
        parser = self.parses[site]
        #self.log("Parser %s %s" % (response.url, parser.name), level=scrapy.log.INFO)
        for item in parser.parse(response):
            yield item
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        abs_url = urljoin_rfc(base_url, relative_url)
        #print abs_url
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
Developer: muzichenglong, Project: scrapyc, Lines: 27, Source: base.py
Example 10: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        # self.log(response.headers, level=scrapy.log.INFO)
        yield scrapy.Request(response.url)
        return
    if response.__class__ != scrapy.http.HtmlResponse:
        return
    base_site = get_url_site(response.url)
    # print response.url, response.status
    base_url = response.url
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        if not self.is_valid_url(relative_url):
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        # print abs_url
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        site = get_url_site(abs_url)
        # yield NimeiItem(url=abs_url, furl=response.url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
        if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
            continue
        self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO)
        yield scrapy.Request(abs_url)
Developer: wjianwei126, Project: scrapyc, Lines: 30, Source: base.py
Example 11: parse_all
def parse_all(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    base_site = get_url_site(base_url)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        filename = abs_url.split("?")[0].split("/")[-1]
        if filename:
            ctype = filename.split(".")[-1].lower()
        else:
            ctype = None
        if ctype in ["jpeg", "jpg", "swf", "rar", "zip", "gz", "gif", "mov", "png", "bmp", "exe", "pps", "db", "txt", "pptx", "xls", "ppt", "xlsx"]:
            continue
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        site = get_url_site(abs_url)
        if site != base_site:
            continue
        if ctype in ["pdf", "doc", "docx", "rtf"]:
            continue
        yield scrapy.Request(url=abs_url, callback=self.parse_all)
Developer: muzichenglong, Project: scrapyc, Lines: 31, Source: pdf.py
Example 12: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    base_url = get_base_url(response)
    for sel in response.xpath('//a/@href'):
        relative_url = sel.extract()
        if relative_url.startswith("javascript:"):
            continue
        if "mod=redirect" in relative_url or "redirect.php" in relative_url:
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        schema = get_url_scheme(abs_url)
        if schema not in ["http", "https"]:
            continue
        #yield NimeiItem(url=abs_url, furl=response.url)
        abs_url = self.remove_param(abs_url, ["extra", "orderby", "typeid", "filter", "sortid", "searchsort", "vk_payway_13", "sid", "recommend", "digest"])
        if self.PATTERN1.match(abs_url):
            abs_url = re.sub("\-\d+\-\d+\.html.*", "-1-1.html", abs_url, 1)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
        if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/") or relative_url.startswith("forumdisplay.php?fid=") or relative_url.startswith("forum.php?mod=forumdisplay&fid="):
            yield scrapy.Request(abs_url)
Developer: muzichenglong, Project: scrapyc, Lines: 29, Source: bbs.py
Example 13: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    self.reset()
    self.feed(response_text)
    self.close()

    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

    ret = []
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = link.text.decode(response_encoding)
        ret.append(link)

    return ret
Developer: bihicheng, Project: scrapy, Lines: 16, Source: htmlparser.py
Example 14: _extract_links
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
    """ Do the real extraction work """
    self.reset()
    self.feed(response_text)
    self.close()

    ret = []
    if base_url is None:
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    for link in self.links:
        link.url = urljoin_rfc(base_url, link.url, response_encoding)
        link.url = safe_url_string(link.url, response_encoding)
        link.text = str_to_unicode(link.text, response_encoding, errors='replace')
        ret.append(link)

    return ret
Developer: bihicheng, Project: scrapy, Lines: 16, Source: sgml.py
Example 15: parse_zgyszz
def parse_zgyszz(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    #base_site = get_url_site(base_url)
    if "qklist/show-" in response.url:
        base_url = get_base_url(response)
        downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
        relative_url = downLink.split("'")[1]
        abs_url = urljoin_rfc(base_url, relative_url)
        yield scrapy.Request(abs_url, callback=self.parse_zgyszz)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        return
    if '/upload/qklist/' in response.url:
        yield self.baidu_rpc_request({"url": response.url, "src_id": 22})
        return
    base_url = response.url
    for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        request = scrapy.Request(abs_url, callback=self.parse_zgyszz)
        #request.meta["dont_redirect"] = True
        yield request
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
    for sel in response.xpath("//div[@class='flickr']/a/@href"):
        relative_url = sel.extract().encode(response.encoding)
        if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
            continue
        abs_url = urljoin_rfc(base_url, relative_url)
        abs_url = safe_url_string(abs_url, encoding=response.encoding)
        request = scrapy.Request(abs_url, callback=self.parse_zgyszz)
        yield request
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
Developer: muzichenglong, Project: scrapyc, Lines: 44, Source: pdf.py
Example 16: parse
def parse(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        yield scrapy.Request(url=response.url)
        return
    base_url = get_base_url(response)
    for href in response.xpath('//div[@class="center_bottom_list"]//a/@href').extract():
        if not self.is_valid_url(href):
            continue
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, response.url)
    # pagination
    for href in response.xpath('//div[@class="article_list_page"]//a/@href').extract():
        abs_url = urljoin_rfc(base_url, href)
        yield scrapy.Request(url=abs_url)
Developer: muzichenglong, Project: scrapyc, Lines: 19, Source: sciencemeta-net.py
Example 17: get_base_url
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to the
    given base url. If no base url is found, the given base url is returned
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin_rfc(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
Developer: LucianU, Project: w3lib, Lines: 10, Source: html.py
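A hedged usage sketch of the helper above: the HTML snippets and URLs are illustrative values, assuming _baseurl_re matches a <base href="..."> declaration as described in the docstring.

html = '<html><head><base href="/docs/"></head><body>...</body></html>'
get_base_url(html, baseurl='http://example.com/index.html')
# -> 'http://example.com/docs/'  (the <base> value resolved against baseurl via urljoin_rfc)
get_base_url('<html><head></head></html>', baseurl='http://example.com/index.html')
# -> 'http://example.com/index.html'  (no <base> tag, so baseurl is returned unchanged)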
Example 18: _extract_requests
def _extract_requests(self, response_text, response_url, response_encoding):
    """Extract requests with absolute urls"""
    self.reset()
    self.feed(response_text)
    self.close()
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    self._make_absolute_urls(base_url, response_encoding)
    self._fix_link_text_encoding(response_encoding)
    return self.requests
Developer: bihicheng, Project: scrapy, Lines: 11, Source: reqext.py
Example 19: parse_index
def parse_index(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        return
    for href in response.xpath('//div[@class="az"]/ul/li/p/a/@href').extract():
        if "policy.php" in href:
            continue
        abs_url = urljoin_rfc(response.url, href)
        yield scrapy.Request(url=abs_url + "/article/latestArticlesByJournal")
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, response.url)
Developer: muzichenglong, Project: scrapyc, Lines: 11, Source: sciencemeta-net.py
Example 20: parse_content
def parse_content(self, response):
    self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        yield scrapy.Request(url=response.url, callback=self.parse_content)
        return
    base_url = get_base_url(response)
    # parse article links
    for href in response.xpath("//em/a/@href").extract():
        relative_url = href
        abs_url = urljoin_rfc(base_url, relative_url)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
Developer: muzichenglong, Project: scrapyc, Lines: 12, Source: cqvip.py
Note: The w3lib.url.urljoin_rfc examples in this article were collected by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets come from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution or reuse should follow each project's license. Do not reproduce without permission.