本文整理汇总了Python中utils.inspector.save_report函数的典型用法代码示例。如果您正苦于以下问题:Python save_report函数的具体用法?Python save_report怎么用?Python save_report使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了save_report函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: run
def run(options):
    """Scrape SEC OIG reports, one topic listing page at a time."""
    year_range = inspector.year_range(options)

    # Honor an explicit comma-separated --topics option; default to all topics.
    topics = options.get('topics')
    topics = topics.split(",") if topics else TOPIC_TO_URL.keys()

    for topic in topics:
        topic_url = TOPIC_TO_URL[topic]
        body = utils.download(topic_url)
        doc = BeautifulSoup(body)

        # The listing markup varies across topic pages; fall through the
        # known container selectors until one of them matches.
        try:
            year_results = doc.select("#Listing")[0]
            results = [x for x in year_results.select("ul li ul li")]
        except IndexError:
            try:
                all_results = doc.select("#bodyholder")[0]
                results = [x for x in all_results.select("ul li")]
            except IndexError:
                results = doc.select("table ul li")

        # Sometimes multiple reports are listed under the same datetime
        # element. Remember the published datetime we saw last so the next
        # report can fall back to it if it has none of its own.
        last_published_on = None
        for result in results:
            report, last_published_on = report_from(result, topic_url, topic, year_range, last_published_on)
            if report:
                inspector.save_report(report)
开发者ID:stvnrlly,项目名称:inspectors-general,代码行数:31,代码来源:sec.py
示例2: run
def run(options):
    """Scrape VA OIG audit reports (paginated) and semiannual reports."""
    year_range = inspector.year_range(options, archive)

    # Audit reports are paginated; walk pages until an empty one appears.
    for page in range(1, 1000):
        doc = beautifulsoup_from_url("{}?RS={}".format(REPORTS_URL, page))
        results = doc.select("div.leadin")
        if not results:
            # An empty first page means the selector broke; otherwise we
            # simply ran past the last page.
            if page == 1:
                raise inspector.NoReportsFoundError("VA (audit reports)")
            else:
                break
        for result in results:
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)

    # Semiannual reports live on a single, unpaginated page.
    doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("div.leadin")
    if not results:
        raise inspector.NoReportsFoundError("VA (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:26,代码来源:va.py
示例3: run
def run(options):
    """Scrape NASA OIG audit reports (per year) plus the "other" listing."""
    year_range = inspector.year_range(options, archive)

    # Audit reports: one table page per two-digit year.
    for year in year_range:
        url = AUDITS_REPORTS_URL.format(str(year)[2:4])
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("tr")
        if not results:
            raise inspector.NoReportsFoundError("NASA (%d)" % year)
        for index, result in enumerate(results):
            # Skip the header row and any empty rows.
            if not index or not result.text.strip():
                continue
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Everything else lives on a single list page.
    doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
    results = doc.select("#subContainer ul li")
    if not results:
        raise inspector.NoReportsFoundError("NASA (other)")
    for result in results:
        report = other_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:27,代码来源:nasa.py
示例4: scrape_reports
def scrape_reports(options):
    """Pull reports from "Reports and Testimonies - Browse by date" web page."""
    REPORTS_URL = 'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/' +\
        '%s&adv_end_date=12/31/%s&rows=50&o=%s'  # % (year, year, offset)
    archive = 1970
    # Amazingly, reports go back to 1940, though those are unlikely to be
    # legible enough to OCR. Also very cool, even 1950s-era reports seem to
    # have a highlightable embedded text layer in them. Of course, it was
    # the General Accounting Office back then and less oversighty.
    year_range = inspector.year_range(options, archive)

    for year in year_range:
        # Page through this year's results 50 at a time, following the
        # "Next" pager link until it disappears.
        is_next_page = True
        offset = 0
        while is_next_page:
            doc = utils.beautifulsoup_from_url(
                REPORTS_URL % (year, year, offset))
            for result in doc.select("div.listing"):
                report = process_report(result, year_range)
                if report:
                    inspector.save_report(report)
            page_links = doc.select("a.non-current_page")
            if len(page_links) and page_links[-1].text.startswith('Next'):
                offset += 50
            else:
                is_next_page = False
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:28,代码来源:gaoreports.py
示例5: run
def run(options):
    """Scrape SBA OIG reports via the site's AJAX pagination endpoint."""
    year_range = inspector.year_range(options, archive)

    # Suggested flow, for an IG which paginates results.
    pages = options.get('pages', ALL_PAGES)
    for page in range(1, (int(pages) + 1)):
        data = {
            'view_name': 'oig_nodes',
            'view_display_id': 'block_search_oig_reports',
        }
        # Only add page= if page > 0.
        if page:
            data['page'] = page
        response = utils.scraper.post(REPORTS_AJAX_URL,
            data=data,
            headers={
                "Content-Type": "application/x-www-form-urlencoded",
            },
        )
        # The AJAX payload returns the rendered table HTML in the second
        # element of the JSON response.
        page_html = response.json()[1]['data']
        doc = BeautifulSoup(page_html)
        results = doc.select("tr")
        if not results:
            break
        for index, result in enumerate(results):
            # Skip the header row.
            if not index:
                continue
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
开发者ID:slobdell,项目名称:inspectors-general,代码行数:33,代码来源:sba.py
示例6: run
def run(options):
    """Scrape Department of Labor OIG audit reports and semiannual reports.

    Fix: the semiannual NoReportsFoundError message previously misspelled
    "semiannual" as "semiannal".
    """
    year_range = inspector.year_range(options, archive)
    pre_1998_done = False

    # Pull the audit reports, paging through each year's listing.
    for year in year_range:
        if year < 1998:
            # Pre-1998 reports share a single listing; scrape it only once
            # no matter how many early years are in the requested range.
            if pre_1998_done:
                continue
            else:
                pre_1998_done = True
        for page_number in range(0, 10000):
            year_url = url_for(year, page_number)
            doc = beautifulsoup_from_url(year_url)
            results = doc.select("ol li")
            if not results:
                # Empty first page means the selector is broken; otherwise
                # we have simply walked past the last page.
                if page_number == 0:
                    raise inspector.NoReportsFoundError("Department of Labor (%s)" % year_url)
                else:
                    break
            for result in results:
                report = report_from(result, year_url)
                if report:
                    inspector.save_report(report)

    # Pull the semiannual reports.
    doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("p > a:nth-of-type(1)")
    if not results:
        raise inspector.NoReportsFoundError("Department of Labor (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:35,代码来源:labor.py
示例7: run
def run(options):
    """Scrape OSC reports from the per-year tables.

    Fix: the first ``.replace()`` argument was originally the HTML entity
    ``'&nbsp;'`` (per the adjacent comment); it was rendered into a literal
    space by the page this code was scraped from, turning the call into a
    no-op ``.replace(' ', ' ')``. Restored here.
    """
    year_range = inspector.year_range(options, archive)
    report_flag = False

    # Pull the table of reports for each year.
    for year in year_range:
        url = url_for_year(year)
        html = utils.download(url, scraper_slug="osc")
        if html is None:
            # The current year's page may not exist yet; anything else
            # failing to download is a hard error.
            if year == max(year_range):
                continue
            else:
                raise Exception("Couldn't fetch reports page {}".format(url))

        # Spaces appear as &nbsp; and \u200b .... fix that now.
        html = html.replace('&nbsp;', ' ').replace('\u200b', ' ').replace('\u00a0', ' ').replace('\r', '').replace('\n', '')
        doc = BeautifulSoup(html, "lxml")

        OUTCOME_CODES = generate_outcome_codes(doc)

        # A few reports appear multiple times... ignore them the second
        # time if they appear more than once.
        keys_used = []

        # No ids on the tables, but it's the second one.
        results = doc.findAll("table")[1].tbody.findAll('tr')
        for result in results:
            reports = report_from(result, year, year_range, url, OUTCOME_CODES)
            for report in reports:
                if report['report_id'] not in keys_used:
                    inspector.save_report(report)
                    keys_used.append(report['report_id'])
                    report_flag = True

    if not report_flag:
        raise inspector.NoReportsFoundError("OSC")
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:34,代码来源:osc.py
示例8: run
def run(options):
    """Scrape Department of the Interior OIG reports from the search UI."""
    year_range = inspector.year_range(options, archive)
    min_year = min(year_range)

    page = 0
    last_page = 0
    while page <= last_page:
        doc = utils.beautifulsoup_from_url(REPORT_SEARCH_URL.format(min_year, page))

        # The pager's "last page" link tells us how far to iterate; keep
        # re-reading it since it can move as we page.
        last_page_link = doc.find("a", title="Go to last page")
        if last_page_link:
            href = last_page_link["href"]
            page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
            if page_match:
                last_page = int(page_match.group(1))

        results = doc.select(".view-reports-advanced-search .views-row")
        if not results:
            raise inspector.NoReportsFoundError("Department of the Interior")
        for result in results:
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
        page += 1

    if last_page == 0:
        raise Exception("Did not find last page link")
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:25,代码来源:interior.py
示例9: run
def run(options):
    """Scrape USAID OIG reports: paginated report types plus semiannuals."""
    year_range = inspector.year_range(options, archive)

    # Pull the reports with pagination.
    for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
        for page in range(0, 999):
            url = report_url_format.format(page=page)
            doc = BeautifulSoup(utils.download(url))
            results = doc.select("li.views-row")
            if not results:
                # Nothing on page 0 means the scraper is broken; nothing on
                # a later page just means we ran out of pages.
                if page == 0:
                    raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
                else:
                    break
            for result in results:
                report = report_from(result, url, report_type, year_range)
                if report:
                    inspector.save_report(report)

    # Pull the semiannual reports (no pagination).
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("li.views-row")
    if not results:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:29,代码来源:usaid.py
示例10: run
def run(options):
    """Scrape USPS OIG reports, paging through listing pages.

    Fix: ``logging.warn`` is a deprecated alias; use ``logging.warning``.
    """
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # Default to starting at page 1.
    begin = int(options.get('begin', 1))

    max_page = None
    for page in range(begin, (int(pages) + 1)):
        if max_page and (page > max_page):
            logging.debug("End of pages!")
            break
        logging.debug("## Downloading page %i" % page)
        url = url_for(options, page)
        body = utils.download(url)
        doc = BeautifulSoup(body)

        # When the USPS restores their page controls, we can use this again,
        # which saves one network call each time.
        max_page = last_page_for(doc)

        results = doc.select(".views-row")
        for result in results:
            report = report_from(result)

            # Inefficient enforcement of --year arg; USPS doesn't support it
            # server-side.
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                logging.warning("[%s] Skipping report, not in requested range." % report['report_id'])
                continue

            inspector.save_report(report)
开发者ID:slobdell,项目名称:inspectors-general,代码行数:34,代码来源:usps.py
示例11: run
def run(options):
    """Scrape SIGTARP reports plus the quarterly reports listing."""
    year_range = inspector.year_range(options, archive)

    # Pull each report-type listing page.
    for report_type, report_url in REPORT_URLS:
        doc = utils.beautifulsoup_from_url(report_url)
        results = doc.select("td.mainInner div.ms-WPBody > div > ul > li")
        if not results:
            raise inspector.NoReportsFoundError("SIGTARP ({})".format(report_type))
        for result in results:
            report = report_from(result, report_type, year_range)
            if report:
                inspector.save_report(report)

    # Quarterly reports live on their own SharePoint page.
    doc = utils.beautifulsoup_from_url(QUARTERLY_REPORTS_URL)
    results = doc.select("#MSOZoneCell_WebPartWPQ3 .s4-wpTopTable a")
    if not results:
        raise inspector.NoReportsFoundError("SIGTARP (quarterly reports)")
    for result in results:
        report = quarterly_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:26,代码来源:sigtarp.py
示例12: run
def run(options):
    """Scrape Ex-Im Bank OIG reports from a set of listing pages.

    Fix: ``link_url == None`` replaced with the idiomatic identity test
    ``link_url is None`` (PEP 8); behavior is unchanged.
    """
    year_range = inspector.year_range(options)
    for page_url in URLS:
        done = False
        body = utils.download(page_url)
        doc = BeautifulSoup(body)
        maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
        all_p = maincontent.find_all("p")
        for p in all_p:
            for all_text, link_text, link_url in recurse_tree(p, False):
                if link_url is None:
                    continue
                if link_url.startswith("mailto:"):
                    continue
                if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
                    # End of page.
                    done = True
                    break
                if link_url.startswith("https://public.govdelivery.com/"):
                    continue
                # NOTE(review): this inner loop's `continue` only skips to
                # the next index_url, so it never skips the outer-loop body
                # as (presumably) intended — confirm against the upstream
                # scraper before changing.
                for index_url in URLS:
                    if index_url.find(link_url) != -1:
                        continue
                year = DATE_RE.search(all_text).group(3)
                if int(year) not in year_range:
                    continue
                report = report_from(all_text, link_text, link_url, page_url)
                inspector.save_report(report)
            if done: break
开发者ID:spulec,项目名称:inspectors-general,代码行数:34,代码来源:exim.py
示例13: run
def run(options):
    """Scrape Amtrak OIG reports year by year, following the pager."""
    year_range = inspector.year_range(options)
    max_pages = int(options.get('pages', 1))

    for year in year_range:
        page = 1
        done = False
        while not done:
            url = url_for(options, page, year)
            body = utils.download(url)
            doc = BeautifulSoup(body)

            # Decide whether another page exists by looking for a pager
            # link whose text is the next page number.
            next_page = page + 1
            found_next_page = False
            for page_link in doc.select("li.pager-item a.active"):
                if page_link.text == str(next_page):
                    found_next_page = True
                    break
            if not found_next_page:
                done = True
            if next_page > max_pages:
                done = True

            for result in doc.select("table.views-table > tbody > tr"):
                report = report_from(result)
                inspector.save_report(report)

            page = next_page
            if not done:
                print('Moving to next page (%d)' % page)
开发者ID:spulec,项目名称:inspectors-general,代码行数:32,代码来源:amtrak.py
示例14: extract_reports_for_subtopic
def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
    """Scrape all HHS OIG reports listed on one subtopic page."""
    doc = beautifulsoup_from_url(subtopic_url)
    if not doc:
        raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

    # This URL is different than the rest and needs to find the "p > a"s
    # first; for everything else, fall through the known selectors.
    results = None
    if subtopic_url == TOPIC_TO_URL['TMPC']:
        results = doc.select("#leftContentInterior > p > a")
    if not results:
        results = doc.select("#leftContentInterior dl dd")
    if not results:
        results = doc.select("#leftContentInterior ul li")
    if not results:
        results = doc.select("#leftContentInterior > p > a")
    if not results:
        raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)

    for result in results:
        # Skip cross-reference and "related" entries, which duplicate
        # reports listed elsewhere.
        if 'crossref' in result.parent.parent.attrs.get('class', []):
            continue
        if result.parent.parent.attrs.get('id') == 'related':
            continue
        report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
        if report:
            inspector.save_report(report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:26,代码来源:hhs.py
示例15: run
def run(options):
    """Scrape USAID OIG reports: paginated report types plus semiannuals."""
    year_range = inspector.year_range(options, archive)

    # Pull the reports with pagination.
    for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
        for page in range(0, 999):
            url = report_url_format.format(page=page)
            doc = utils.beautifulsoup_from_url(url)

            # Canary check: the audit listing's report-number CSS class is
            # load-bearing; fail loudly if the site's markup changed.
            if report_type == "audit" and page == 0 and not doc.select("div.views-field-field-auditreport-doc-1"):
                raise Exception("Report number CSS class has changed")

            results = doc.select("li.views-row")
            if not results:
                # Empty page 0 means the scraper broke; a later empty page
                # just means we ran out of pages.
                if page == 0:
                    raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
                else:
                    break
            for result in results:
                report = report_from(result, url, report_type, year_range)
                if report:
                    inspector.save_report(report)

    # Pull the semiannual reports (no pagination).
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("li.views-row")
    if not results:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:31,代码来源:usaid.py
示例16: run
def run(options):
    """Scrape OPM OIG reports, grouped by year sections on a single page."""
    year_range = inspector.year_range(options)
    only_id = options.get('report_id')

    print("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

    url = url_for()
    body = utils.download(url)
    doc = BeautifulSoup(body)

    for result in doc.select("section"):
        # Each section's title attribute is the year; skip sections whose
        # title isn't a year, or whose year is out of range.
        try:
            year = int(result.get("title"))
            if year not in year_range:
                continue
            print("## Downloading year %i " % year)
        except ValueError:
            continue

        # Walk each table entry under the section and generate a report
        # from it.
        listings = result.div.table.tbody.contents
        for item in listings:
            # Skip bare text nodes between rows.
            if type(item) is not bs4.element.Tag:
                continue
            report = report_from(item)
            # Can limit it to just one report, for debugging convenience.
            if only_id and only_id != report['report_id']:
                continue
            inspector.save_report(report)
开发者ID:spulec,项目名称:inspectors-general,代码行数:34,代码来源:opm.py
示例17: run
def run(options):
    """Scrape FHFA OIG audit reports (paginated) and other report types."""
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # Pull the audit reports. Pages are 0-indexed.
    for page in range(0, int(pages) - 1):
        doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
        results = doc.select("span.field-content")
        if not results:
            # No more results, we must have hit the last page.
            break
        for result in results:
            report = report_from(result, year_range, report_type='audit')
            if report:
                inspector.save_report(report)

    # Grab the other reports; the markup differs between listings, so try
    # a second selector when the first finds nothing.
    for report_type, url in OTHER_REPORT_URLS.items():
        doc = BeautifulSoup(utils.download(url))
        results = doc.select(".views-field")
        if not results:
            results = doc.select(".views-row")
        for result in results:
            report = report_from(result, year_range, report_type)
            if report:
                inspector.save_report(report)
开发者ID:slobdell,项目名称:inspectors-general,代码行数:27,代码来源:fhfa.py
示例18: run
def run(options):
    """Scrape TIGTA audits, inspections, testimony, and semiannual reports."""
    year_range = inspector.year_range(options)

    # Audit and inspection reports are served via per-year JS files.
    for year in year_range:
        url = audit_report_url(year)
        if url:
            parse_result_from_js_url(url, "auditreports", year, year_range)
        url = inspection_report_url(year)
        if url:
            parse_result_from_js_url(url, "iereports", year, year_range)

    # Pull the congressional testimony (first disc-style list on the page).
    doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
    for result in doc.findAll("ul", type='disc')[0].select("li"):
        report = congressional_testimony_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports (same list structure).
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    for result in doc.findAll("ul", type='disc')[0].select("li"):
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
开发者ID:BunsenMcDubbs,项目名称:inspectors-general,代码行数:27,代码来源:tigta.py
示例19: scrape_restricted_reports
def scrape_restricted_reports(options):
    """Restricted Products.
    A single HTML page lists unreleased reports since 2014, with no links."""

    # These reports are unreleased -- we could make this the text?
    """The following products have been determined to contain either
    classified information or controlled unclassified information by the audited
    agencies and cannot be publicly released.

    Members of Congress or congressional staff who wish to obtain one or more of
    these products should call or e-mail the Congressional Relations Office.
    All others who wish to obtain one or more of these products should follow the
    instructions found on Requesting Restricted Products."""

    REPORTS_URL = 'http://www.gao.gov/restricted/restricted_reports'
    archive = 2014

    year_range = inspector.year_range(options, archive)
    doc = utils.beautifulsoup_from_url(REPORTS_URL)

    for result in doc.select("div.listing"):
        report = process_restricted_report(result, year_range, REPORTS_URL)
        if report:
            inspector.save_report(report)
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:25,代码来源:gaoreports.py
示例20: run
def run(options):
    """Scrape PRC OIG reports from the ASP.NET GridView, page by page.

    Fix: the regex pattern containing ``\\d`` is now a raw string literal;
    ``"Page 1 of (\\d+)"`` as a plain string relies on Python passing the
    unknown escape through, which raises a DeprecationWarning (and will be
    an error) in newer Python versions.
    """
    year_range = inspector.year_range(options, archive)

    # Find the number of pages to iterate.
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    page_count_text = doc.select("div.AspNet-GridView-Pagination")[0].text
    page_count = int(re.search(r"Page 1 of (\d+)", page_count_text).groups()[0])

    # Iterate over those pages by replaying the GridView's postback.
    for page in range(1, page_count + 1):
        response = utils.scraper.post(
            REPORTS_URL,
            data={
                "__EVENTTARGET": "ctl00$ctl00$MainContent$NavTreeSubContent$sv$GridViewSummary",
                "__EVENTARGUMENT": "Page${page_number}".format(page_number=page),
            },
            cookies=COOKIES,
        )
        doc = BeautifulSoup(response.content)
        results = doc.select("div.AspNet-GridView table tr")
        if not results:
            break
        for index, result in enumerate(results):
            # Skip the header row.
            if not index:
                continue
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
开发者ID:slobdell,项目名称:inspectors-general,代码行数:29,代码来源:prc.py
注:本文中的utils.inspector.save_report函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论