This page collects typical usage examples of Python's newspaper.Article class. If you have been wondering what exactly the Article class does and how to use it, the hand-picked class examples below may help.
The following presents 20 code examples of the Article class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code examples.
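For orientation before the examples: the typical Article workflow is construct, download(), parse(), and optionally nlp(). A minimal self-contained sketch (the URL is a placeholder and error handling is omitted):

from newspaper import Article

article = Article('http://example.com/some-story.html')  # placeholder URL
article.download()  # fetch the HTML
article.parse()     # populate title, text, authors, publish_date, top_image, ...
article.nlp()       # optional: populate article.keywords and article.summary
print(article.title)
print(article.text[:200])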
Example 1: get_details

def get_details():
    # request, abort, jsonify come from Flask; is_image() is a helper from this project
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        # Image URLs need no article extraction; echo them back directly
        result = {
            "url": url,
            "top_image": url,
            "text": "",
        }
        return jsonify(result)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        # Strip any query string from the image URL
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    result = {
        "url": url,
        "top_image": top_image,
        "text": article.text,
    }
    return jsonify(result)

Developer: mskog, Project: cheapskate, Lines of code: 33, Source file: cheapskate.py
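The IOError/UnicodeDecodeError handling above matches the older newspaper releases these examples target; in current newspaper3k, a failed download instead surfaces as an ArticleException raised by parse(). A sketch of the equivalent guard, assuming newspaper3k (the safe_text name is illustrative):

from newspaper import Article, ArticleException

def safe_text(url):
    article = Article(url)
    try:
        article.download()
        article.parse()
    except ArticleException:
        return None  # corresponds to the HTTP 422 branch above
    return article.text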
Example 2: test_spanish_fulltext_extract
def test_spanish_fulltext_extract(self):
    url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
    article = Article(url=url, language="es")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
Developer: WheresWardy, Project: newspaper, Lines of code: 7, Source file: unit_tests.py
Example 3: get_text

def get_text(url):
    article = Article(url)
    article.download()  # download() and parse() return None, so keeping their results is pointless
    article.parse()
    authors = article.authors
    publish_date = article.publish_date  # TODO: slice publish date
    body_text = article.text
    body_text = body_text.replace('"', '\"')  # no-op: '\"' and '"' are the same string in Python
    body_text = body_text.replace('"', '')
    # article.nlp()  # disabled: without nlp(), keywords and summary below stay empty
    keywords = article.keywords
    summary = article.summary
    title = article.title
    tags = article.tags
    # strip_non_ascii is a helper defined elsewhere in this project
    title = strip_non_ascii(title)
    summary = strip_non_ascii(summary)
    body_text = strip_non_ascii(body_text)
    keywords = ' '.join(keywords)
    keywords = strip_non_ascii(keywords)
    return (title, summary, authors, publish_date, body_text, keywords, tags)

Developer: chr7stos, Project: metabrowser, Lines of code: 26, Source file: newspaper_extract.py
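Because the nlp() call in Example 3 is commented out, the keywords and summary it returns will always be empty. A minimal corrected flow, as a sketch rather than the original project's code:

article = Article(url)
article.download()
article.parse()
article.nlp()  # must run before .keywords and .summary are populated
keywords = ' '.join(article.keywords)
summary = article.summary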
Example 4: main

def main():
    try:
        # headline_url, calais_url, and headers are module-level settings in this project
        headlines = requests.get(headline_url)
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            # Send the extracted text to the Calais endpoint for entity tagging
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        # Score the sentiment of the text surrounding each entity mention
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
                            print('--------------------')
    except Exception as e:
        print('Error in connect ', e)

Developer: tallstreet, Project: pr-classifier, Lines of code: 29, Source file: test.py
Example 5: test_chinese_fulltext_extract
def test_chinese_fulltext_extract(self):
    url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
    article = Article(url=url, language="zh")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
Developer: WheresWardy, Project: newspaper, Lines of code: 7, Source file: unit_tests.py
Example 6: test_chinese_fulltext_extract

def test_chinese_fulltext_extract(self):
    url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
    article = Article(url=url, language='zh')
    article.build()  # build() bundles download, parse, and nlp into one call
    # assert isinstance(article.stopwords_class, StopWordsChinese)
    with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()

Developer: Geekking, Project: newspaper, Lines of code: 7, Source file: unit_tests.py
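Example 6 calls article.build() where Example 5 used the explicit download()/parse() pair. In newspaper, build() is a convenience wrapper that downloads, parses, and then runs nlp() in one step, so it is roughly equivalent to this sketch:

article = Article(url=url, language='zh')
article.download()
article.parse()
article.nlp()  # build() also triggers keyword/summary extraction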
Example 7: main

def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'
    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True
    cleaner = Article(url='', config=config)
    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')
    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()
            # Reuse one Article instance as a cleaner: feed it HTML, parse, extract text
            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            with open('htm-out', 'a') as f:
                for r in sentences:
                    f.write(r + '\n')

Developer: cilsat, Project: perisalah-corpus, Lines of code: 34, Source file: main.py
Example 8: extract

def extract(self, item):
    """Creates an instance of Article without a download and returns an
    ArticleCandidate with the results of parsing the HTML code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()
    # Parse the spider's already-downloaded response instead of fetching the URL again
    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format, '
                           'publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang
    return article_candidate

Developer: Sayeedsalam, Project: spec-event-data-server, Lines of code: 27, Source file: newspaper_extractor.py
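The pattern in Example 8 — constructing an Article with an empty URL and feeding it pre-fetched HTML via set_html() — fits any pipeline where the download already happened elsewhere (here, in a Scrapy spider). A standalone sketch of the same idea:

from newspaper import Article

def parse_prefetched(html):
    article = Article('')   # no URL; download() is never called
    article.set_html(html)
    article.parse()
    return article.title, article.text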
Example 9: get_article

def get_article(url):
    a = Article(url)
    a.download()
    a.parse()
    article = dict()
    article['title'] = a.title
    article['publish_date'] = a.publish_date  # newspaper's attribute is publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)  # get_keywords is a project helper
    # nlp()-based summaries are more likely to fail, so they are left disabled:
    # try:
    #     a.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     article['summary'] = a.summary
    return article

Developer: allyjweir, Project: lackawanna, Lines of code: 25, Source file: web_import.py
Example 10: insert_url

def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()
    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text
    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]
    print(item['category'])
    print(item['datetime'])
    # duplicate() and insert_db() are helpers defined elsewhere in this scraper
    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")
    conn.commit()
    conn.close()

Developer: davidsbatista, Project: publico.pt-news-scrapper, Lines of code: 34, Source file: fetch-news-v4.py
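article.publish_date is a datetime.datetime (or None when extraction fails), so serialising it explicitly before the INSERT avoids depending on sqlite3's implicit datetime adapter. A small adjustment to the item built above, assuming the downstream insert_db() accepts strings:

date = article.publish_date
item['datetime'] = date.isoformat() if date is not None else None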
Example 11: makeDocs

def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    # Recreate the 'news' index from scratch
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        # Only index articles that have both a date and body text
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"

Developer: shawncaeiro, Project: persoNews, Lines of code: 33, Source file: createIndex.py
Example 12: test2

def test2(self):
    articles = [
        'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
        'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
        'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
    ]
    # This second assignment overwrites the list above; only the URLs below are fetched
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')

Developer: boztaskent, Project: tensorflow-named-entity, Lines of code: 32, Source file: test_data_helpers.py
Example 13: runTest

def runTest(self):
    # The "correct" fulltext needs to be manually checked;
    # we have 50 so far
    FULLTEXT_PREPARED = 50
    domain_counters = {}
    with open(URLS_FILE, 'r') as f:
        urls = [d.strip() for d in f.readlines() if d.strip()]
    for url in urls[:FULLTEXT_PREPARED]:
        domain = get_base_domain(url)
        if domain in domain_counters:
            domain_counters[domain] += 1
        else:
            domain_counters[domain] = 1
        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)  # parse the mocked HTML instead of fetching the URL
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue
        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))

Developer: pombredanne, Project: newspaper, Lines of code: 31, Source file: unit_tests.py
Example 14: get_image

def get_image():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        return redirect(url)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    if top_image:
        return redirect(top_image)
    else:
        return '', 422

Developer: mskog, Project: cheapskate, Lines of code: 25, Source file: cheapskate.py
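Examples 1 and 14 both strip the query string with rsplit('?', 1) and catch AttributeError for the case where no top image was found. If you prefer to avoid string surgery, the standard library can rebuild the URL without its query and fragment; a sketch:

from urllib.parse import urlsplit

def strip_query(url):
    # keep scheme://host/path, drop ?query and #fragment
    return urlsplit(url)._replace(query='', fragment='').geturl()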
Example 15: f

def f(url):
    url_urls = url.text
    try:
        response = urllib2.urlopen(url_urls)
        status = response.code
        # print "detected webpage code:", status
        if status == 404:
            pass
        else:
            a_zh = Article(url_urls, language='zh')
            a_zh.download()
            a_zh.parse()
            # Fallback to English extraction if the Chinese parse yields no text:
            # content_urls = a_zh.text
            # if content_urls == '':
            #     a_en = Article(url_urls, language='en')
            #     a_en.download()
            #     a_en.parse()
            #     content_urls = content_urls + a_en.text
            # if content_urls != '':
            #     compare_article(url_urls, content_urls)
    except:
        pass

Developer: shujianbu, Project: newshub, Lines of code: 27, Source file: detect_cmp.py
Example 16: parse_news

def parse_news(self, response):
    item = ScrapyGooglenewsItem()
    # only log the warning info from requests
    logging.getLogger("requests").setLevel(logging.WARNING)
    for href in response.xpath('//h2[@class="title"]/a/@href').extract():
        item['link'] = href
        # use newspaper-0.0.8 to scrape the webpage, then get clean text
        article = Article(item['link'])
        article.download()
        article.parse()
        item['title'] = article.title
        item['text'] = article.text
        # item['authors'] = article.authors
        # item['date'] = article.publish_date
        # Map the topic query parameter to a section name
        # (a dict-based alternative is sketched after this example)
        if response.url.split('&')[-1] == 'topic=w':
            item['domain'] = 'World'
        if response.url.split('&')[-1] == 'topic=n':
            item['domain'] = 'U.S.'
        if response.url.split('&')[-1] == 'topic=b':
            item['domain'] = 'Business'
        if response.url.split('&')[-1] == 'topic=tc':
            item['domain'] = 'Technology'
        if response.url.split('&')[-1] == 'topic=e':
            item['domain'] = 'Entertainment'
        if response.url.split('&')[-1] == 'topic=s':
            item['domain'] = 'Sports'
        if response.url.split('&')[-1] == 'topic=snc':
            item['domain'] = 'Science'
        if response.url.split('&')[-1] == 'topic=m':
            item['domain'] = 'Health'
        yield item

Developer: jyb002, Project: Scrapy_GoogleNews, Lines of code: 34, Source file: spider.py
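The if chain that maps the topic query parameter to a section name can be collapsed into a dict lookup, which also makes the fall-through case explicit. A sketch; the 'Unknown' default is an addition, since the original simply leaves domain unset for unrecognised topics:

TOPIC_DOMAINS = {
    'topic=w': 'World', 'topic=n': 'U.S.', 'topic=b': 'Business',
    'topic=tc': 'Technology', 'topic=e': 'Entertainment',
    'topic=s': 'Sports', 'topic=snc': 'Science', 'topic=m': 'Health',
}
item['domain'] = TOPIC_DOMAINS.get(response.url.split('&')[-1], 'Unknown')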
Example 17: scrapeURLS

def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
    deleteURLs(inFilPath, toDelURLs)
    return texts

Developer: pjdrm, Project: EM_pjdrm, Lines of code: 25, Source file: nlp.py
Example 18: get_article_by_url
def get_article_by_url(url):
    article = Article(url, fetch_images=False)
    article.download()
    if url == "empty":
        return "nolist"
    article.parse()
    return article.text
Developer: AndreyPrvt, Project: Article-parser, Lines of code: 7, Source file: RSSFeedParser.py
Example 19: check_url

def check_url(args):
    """
    :param (basestr, basestr) url, res_filename:
    :return: (pubdate_failed, fulltext_failed)
    """
    url, res_filename = args
    pubdate_failed, fulltext_failed = False, False
    html = mock_resource_with(res_filename, 'html')
    try:
        a = Article(url)
        a.download(html)
        a.parse()
        if a.publish_date is None:
            pubdate_failed = True
    except Exception:
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        pubdate_failed, fulltext_failed = True, True
    else:
        correct_text = mock_resource_with(res_filename, 'txt')
        if not (a.text == correct_text):
            # print('Diff: ', simplediff.diff(correct_text, a.text))
            # `correct_text` holds the reason of failure if failure
            print('%s -- %s -- %s' %
                  ('Fulltext failed', res_filename, correct_text.strip()))
            fulltext_failed = True
    # TODO: assert statements are commented out for full-text
    # extraction tests because we are constantly tweaking the
    # algorithm and improving
    # assert a.text == correct_text
    return pubdate_failed, fulltext_failed

Developer: Newspad, Project: newspaper, Lines of code: 32, Source file: unit_tests.py
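Both test helpers (Examples 13 and 19) pass the cached HTML straight into download() so the tests never touch the network. In newspaper3k the same thing is spelled with an explicit keyword argument; a sketch, assuming the newspaper3k signature:

a = Article(url)
a.download(input_html=html)  # parse the mocked HTML instead of fetching the URL
a.parse()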
Example 20: parse_input

def parse_input(text, extractor='newspaper'):
    if isinstance(text, (str, unicode)):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')

Developer: Anhmike, Project: PyTLDR, Lines of code: 26, Source file: preprocess.py
Note: the newspaper.Article class examples on this page were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and distribution or use should follow each project's license. Do not reproduce without permission.