This article collects typical usage examples of the `utils.utils.download` function in Python. If you have been wondering what the `download` function does, how to call it, or where to find real-world examples, the curated samples below should help.
The article presents 20 code examples of the `download` function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code samples.
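Almost every snippet below follows the same pattern: fetch a page with `utils.download(...)`, parse the result with BeautifulSoup, then extract report links. The helper itself is not reproduced on this page, so the sketch below is an assumption inferred from the examples, not the project's actual code: a one-argument call appears to return the response body (it is passed straight to BeautifulSoup), while a two-argument call (see Example 15) appears to save the payload to a destination path.

import requests

def download(url, destination=None):
    # Hedged sketch only: the real utils.utils.download is not shown on
    # this page; this mimics the two call shapes the examples imply.
    response = requests.get(url)
    response.raise_for_status()
    if destination is None:
        # One-argument form: return the body, ready for BeautifulSoup(...)
        return response.text
    # Two-argument form: write the payload to disk and return the path
    with open(destination, "wb") as f:
        f.write(response.content)
    return destination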
Example 1: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the RSS feed
  doc = BeautifulSoup(utils.download(RSS_URL))
  results = doc.select("item")
  for result in results:
    report = rss_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the recent audit reports.
  doc = BeautifulSoup(utils.download(RECENT_AUDITS_URL))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive audit reports
  doc = BeautifulSoup(utils.download(AUDIT_ARCHIVE_URL))
  results = doc.select("div.block a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORTS_URl))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 34 | Source: smithsonian.py
Example 2: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
    results = doc.select("div.content table tr")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type="audit", year_range=year_range)
      if report:
        inspector.save_report(report)

  # Pull the FOIA reports
  doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
  results = doc.select("div.content table tr")
  for index, result in enumerate(results):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, report_type="other", year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div.content a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 35 | Source: ncua.py
Example 3: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2005:  # The earliest year that audits go back to
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.content")
    if not results:
      raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("report")
  if not results:
    raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 26 | Source: tva.py
Example 4: run
def run(options):
  year_range = inspector.year_range(options)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range)
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range)

  # Pull the congressional testimony
  doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: BunsenMcDubbs | Project: inspectors-general | Lines: 27 | Source: tigta.py
Example 5: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the general reports
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = report_from(result, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive reports
  doc = BeautifulSoup(utils.download(REPORT_ARCHIVE_URL))
  results = doc.select("div#mainContent li.mainContenttext a") + doc.select("div#mainContent span.mainContenttext a")
  for result in results:
    if not result.text:
      continue
    report = report_from(result, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 28 | Source: fca.py
Example 6: urls_for_topics
def urls_for_topics(self, topics):
  for topic in topics:
    # Topic might be a tuple for ADDITIONAL_TOPICS (not ones from command
    # line).
    self.report_type = None
    if isinstance(topic, tuple):
      topic, report_type = topic
      self.report_type = report_type

    last_page = False

    url = TOPIC_TO_URL[topic]
    page = BeautifulSoup(utils.download(url))
    page_started = self.is_first_page(page)
    if page_started:
      yield url

    for link in page.select('li.pager-item a'):
      next_url = urljoin(url, link['href'])
      next_page = BeautifulSoup(utils.download(next_url))
      if not page_started:
        page_started = self.is_first_page(next_page)
      if page_started:
        yield next_url
      last_page = self.is_last_page(next_page)
      if last_page:
        break
    if last_page:
      continue

  self.report_type = None  # Clear this out afterwards
Developer: slobdell | Project: inspectors-general | Lines: 30 | Source: energy.py
Example 7: urls_for
def urls_for(self):
  only = self.options.get('topics')
  if only:  # if only...
    only = set(only.split(','))
    only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
            for o in only]
    yield from self.urls_for_topics(only)
    # If there are topics selected, ONLY yield URLs for those.
    return

  # First yield the URLs for the topics that are tangential to the main
  # Calendar Year reports.
  yield from self.urls_for_topics(ADDITIONAL_TOPICS)

  # Not getting reports from specific topics, iterate over all Calendar Year
  # reports.
  page = BeautifulSoup(utils.download(BASE_URL))

  # Iterate over each "Calendar Year XXXX" link
  for li in page.select('.field-items li'):
    md = RE_CALENDAR_YEAR.search(li.text)
    if md:
      cur_year = int(md.group(1))
      if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
        href = li.select('a')[0]['href']
        next_url = urljoin(BASE_URL, href)

        # The first page of reports is yielded.
        yield next_url

        # Next, read all the pagination links for the page and yield those. So
        # far, I haven't seen a page that doesn't have all of the following
        # pages enumerated.
        next_page = BeautifulSoup(utils.download(next_url))
        for link in next_page.select('li.pager-item a'):
          yield urljoin(BASE_URL, link['href'])
Developer: slobdell | Project: inspectors-general | Lines: 35 | Source: energy.py
Example 8: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 27 | Source: nasa.py
Example 9: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break
      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 29 | Source: usaid.py
Example 10: run
def run(options):
  year_range = inspector.year_range(options, archive)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the semiannual reports
  semiannual_results = doc.select("#AnnualManagementReports select")[0]
  for result in semiannual_results.select("option"):
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the special reports
  special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
  for index, result in enumerate(special_report_table.select("tr")):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the audit reports
  for year in year_range:
    if year < 2001:  # The oldest fiscal year page available
      continue
    year_url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(year_url))
    for index, result in enumerate(doc.select("#main table tr")):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_url, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 35 | Source: rrb.py
Example 11: run
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages) - 1):
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
    results = doc.select("span.field-content")
    if not results:
      # No more results, we must have hit the last page
      break
    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 27 | Source: fhfa.py
Example 12: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2006:  # The oldest year for audit reports
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div#content li")
    for result in results:
      report = audit_report_from(result, url, year, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#content li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the Peer Review
  doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
  result = doc.find("div", id='content').find("a", text=True)
  report = peer_review_from(result, year_range)
  inspector.save_report(report)
Developer: slobdell | Project: inspectors-general | Lines: 28 | Source: archives.py
Example 13: extract_reports_for_oei
def extract_reports_for_oei(year_range):
  topic_name = TOPIC_NAMES["OE"]
  topic_url = TOPIC_TO_URL["OE"]
  root_body = utils.download(topic_url)
  root_doc = BeautifulSoup(root_body)

  letter_urls = set()
  for link in root_doc.select("#leftContentInterior li a"):
    absolute_url = urljoin(topic_url, link['href'])
    absolute_url = strip_url_fragment(absolute_url)
    letter_urls.add(absolute_url)

  if not letter_urls:
    raise inspector.NoReportsFoundError("HHS (OEI first pass)")

  all_results_links = {}
  all_results_unreleased = []
  for letter_url in letter_urls:
    letter_body = utils.download(letter_url)
    letter_doc = BeautifulSoup(letter_body)

    results = letter_doc.select("#leftContentInterior ul li")
    if not results:
      raise inspector.NoReportsFoundError("HHS (OEI %s)" % letter_url)
    for result in results:
      if 'crossref' in result.parent.parent.attrs.get('class', []):
        continue
      if result.parent.parent.attrs.get('id') == 'related':
        continue

      node = result
      while node and node.name != "h2":
        node = node.previous
      if node and node.name == "h2":
        subtopic_name = str(node.text)
      else:
        subtopic_name = "(unknown)"

      links = result.findAll("a")
      if len(links) == 0:
        result.extract()
        all_results_unreleased.append([result, subtopic_name])
      else:
        url = links[0].get("href")
        if url not in all_results_links:
          result.extract()
          all_results_links[url] = [result, subtopic_name]
        else:
          existing_result = all_results_links[url][0]
          for temp in result.contents:
            temp.extract()
            existing_result.append(temp)
          all_results_links[url][1] = "%s, %s" % (all_results_links[url][1], subtopic_name)

  subtopic_url = TOPIC_TO_URL["OE"]
  for result, subtopic_name in itertools.chain(all_results_links.values(), all_results_unreleased):
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 59 | Source: hhs.py
Example 14: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports)")
  for result in results:
    # ignore divider lines
    if result.select("img"): continue

    report = report_from(result, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content)
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():  # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range)
    if report:
      inspector.save_report(report)
Developer: harrisj | Project: inspectors-general | Lines: 55 | Source: nsf.py
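A detail worth noting in Example 14: the case reports sit behind a POST form, so the scraper switches from `utils.download` to `utils.scraper.post` and parses `response.content` instead. The download helper evidently covers plain GET fetches, while form submissions go through the underlying scraper object.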
Example 15: handle_scanner_args
def handle_scanner_args(args, opts) -> Tuple[dict, list]:
    """
    --analytics: file path or URL to a CSV of participating domains.

    This function also handles checking for the existence of the file,
    downloading it successfully, and reading the file in order to populate the
    list of analytics domains.
    """
    parser = scan_utils.ArgumentParser(prefix_chars="--")
    parser.add_argument("--analytics", nargs=1, required=True)
    parsed, unknown = parser.parse_known_args(args)
    dicted = vars(parsed)
    should_be_single = ["analytics"]
    dicted = scan_utils.make_values_single(dicted, should_be_single)
    resource = dicted.get("analytics")
    if not resource.endswith(".csv"):
        no_csv = "".join([
            "--analytics should be the file path or URL to a CSV of participating",
            " domains and end with .csv, which '%s' does not" % resource
        ])
        logging.error(no_csv)
        raise argparse.ArgumentTypeError(no_csv)

    try:
        parsed_url = urlparse(resource)
    except:
        raise

    if parsed_url.scheme and parsed_url.scheme in ("http", "https"):
        analytics_path = Path(opts["_"]["cache_dir"], "analytics.csv").resolve()
        try:
            utils.download(resource, str(analytics_path))
        except:
            logging.error(utils.format_last_exception())
            no_csv = "--analytics URL %s not downloaded successfully." % resource
            logging.error(no_csv)
            raise argparse.ArgumentTypeError(no_csv)
    else:
        if not os.path.exists(resource):
            no_csv = "--analytics file %s not found." % resource
            logging.error(no_csv)
            raise FileNotFoundError(no_csv)
        else:
            analytics_path = resource

    analytics_domains = utils.load_domains(analytics_path)
    dicted["analytics_domains"] = analytics_domains
    del dicted["analytics"]

    return (dicted, unknown)
Developer: 18F | Project: domain-scan | Lines: 48 | Source: analytics.py
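Example 15 is the one snippet here that uses the two-argument form: instead of returning the page body, `utils.download(resource, str(analytics_path))` writes the CSV into the scanner's cache directory, and the domain list is then read back from disk. A brief usage sketch; the URL and path below are made up for illustration:

# Hypothetical values for illustration only; a real run gets these
# from the --analytics flag and the scanner's cache_dir option.
csv_url = "https://example.com/participating-domains.csv"
cache_path = "/tmp/cache/analytics.csv"
utils.download(csv_url, cache_path)       # saves the file to cache_path
domains = utils.load_domains(cache_path)  # then read the domain list back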
Example 16: extract_reports_for_subtopic
def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
  if subtopic_url.startswith("http://httphttp://"):
    # See notes to IG's web team
    subtopic_url = subtopic_url.replace("http://http", "")

  body = utils.download(subtopic_url)
  doc = BeautifulSoup(body)

  results = doc.select("#body-row02-col02andcol03 a")
  if not results:
    results = doc.select("#body-row02-col01andcol02andcol03 a")
  if not results and "There are currently no reports in this category" not in doc.text:
    raise AssertionError("No report links found for %s" % subtopic_url)

  topic_name = TOPIC_NAMES[topic]
  # Broadcasting Board of Governors is a fully independent agency
  if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
    agency = 'bbg'
  else:
    agency = 'state'

  for result in results:
    report = report_from(result, year_range, agency, topic_name, subtopic)
    if report:
      inspector.save_report(report)
Developer: BunsenMcDubbs | Project: inspectors-general | Lines: 25 | Source: state.py
Example 17: semiannual_report_from
def semiannual_report_from(result, year_range):
  link = result.find("a")
  title = link.text

  # Parse the report title. Ex:
  # 'OIG Semiannual Report to the Congress: October 1, 2013 - March 31, 2014 (incl. MCC)'
  published_on_text = title.split("-")[-1].split("–")[-1].split("(")[0].strip()
  published_on_text = published_on_text.replace("September 31", "September 30")  # See note to IG Web team
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  landing_url = urljoin(SEMIANNUAL_REPORTS_URL, link.get('href'))
  landing_page = BeautifulSoup(utils.download(landing_url))

  report_url = landing_page.select("div.filefield-file a")[0].get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  report = {
    'inspector': "usaid",
    'inspector_url': "https://oig.usaid.gov",
    'agency': "usaid",
    'agency_name': "Agency For International Development",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Developer: slobdell | Project: inspectors-general | Lines: 34 | Source: usaid.py
Example 18: run
def run(options):
  year_range = inspector.year_range(options)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the audit reports
  audit_header = doc.find("a", attrs={"name": 'Audit Reports'})
  audit_list1 = audit_header.find_next("ul").select("li")
  # They have two separate uls for these reports. See note to the IG web team.
  audit_list2 = audit_header.find_next("ul").find_next("ul").select("li")
  results = audit_list1 + audit_list2
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the inspection reports
  inspections_header = doc.find("a", attrs={"name": 'Inspection Reports'})
  results = inspections_header.find_next("ul").select("li")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  semiannual_header = doc.find("a", attrs={"name": 'Semiannual Reports'})
  results = semiannual_header.find_next("ul").select("li")
  for result in results:
    report = report_from(result, year_range, title_prefix="Semiannual Report - ")
    if report:
      inspector.save_report(report)
Developer: JaimeLynSchatz | Project: inspectors-general | Lines: 34 | Source: fec.py
Example 19: run
def run(options):
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url == None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        for index_url in URLS:
          if index_url.find(link_url) != -1:
            continue
        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue

        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done: break
Developer: spulec | Project: inspectors-general | Lines: 34 | Source: exim.py
Example 20: run
def run(options):
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      body = utils.download(year_url)
      doc = BeautifulSoup(body)

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
          if report_id in all_reports:
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                                              + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)
Developer: Cloudxtreme | Project: inspectors-general | Lines: 35 | Source: dot.py
Note: The `utils.utils.download` examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors. For distribution and use, please follow each project's license. Do not repost without permission.