
Python newspaper.build Function Code Examples


This article collects typical usage examples of the newspaper.build function in Python. If you are wondering what build does, how to call it, or what real-world code that uses it looks like, the curated examples below should help.



The following presents 20 code examples of the build function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
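Before the examples, here is a minimal sketch of a typical newspaper.build call (the target URL is arbitrary and chosen purely for illustration). It exercises the two options most of the samples below rely on, memoize_articles and language, and shows the usual download/parse cycle on the resulting articles.

import newspaper

# Build a Source object for a news site. memoize_articles=False re-scans the
# whole site instead of skipping article URLs seen on previous runs.
paper = newspaper.build('http://cnn.com', memoize_articles=False, language='en')

print(paper.size())  # number of article URLs discovered on the site

for article in paper.articles[:5]:
    article.download()    # fetch the article HTML
    article.parse()       # extract title, text, authors, publish date
    print(article.title)

Note that build only collects article URLs; the actual content is fetched lazily via download() and parse(), which is why nearly every example below loops over paper.articles.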

Example 1: get_articles

def get_articles():
	# get Chinese articles from domain
	for url in open("list_ch.txt", 'r'):
		try: 
			paper = newspaper.build(url, memoize_articles = True, language = 'zh')
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)

			for article in paper.articles:
				get_meta(article, domain)

		except:
			pass


	# get English articles from domain
	for url in open("list_en.txt", 'r'):
		try:
			paper = newspaper.build(url, memoize_articles = True, language = 'en')
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)

			for article in paper.articles:
				get_meta(article, domain)

		except:
			pass


	# get articles from RSS
	for url in open("list_rss_ch.txt", 'r'):
		try:
			feed = feedparser.parse(url)
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)
			chinese = True

			for post in feed.entries:
				link = post.link
				get_meta_rss(link, domain, chinese)

		except:
			pass

	for url in open("list_rss_en.txt", 'r'):
		try:
			feed = feedparser.parse(url)
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)
			chinese = False

			for post in feed.entries:
				link = post.link
				get_meta_rss(link, domain, chinese)

		except:
			pass

	print "success!"
	return
Developer: BersaKAIN, Project: newshub, Lines: 60, Source: fetch.py


Example 2: CheckForMoreArticles

def CheckForMoreArticles():
    print 'Checking for more articles from CNN'
    cnn = newspaper.build(u'http://us.cnn.com/')
    print 'Found ' + str(cnn.size()) + ' new articles from CNN'
    print 'Checking for more articles from SMH'
    smh = newspaper.build(u'http://smh.com.au/')
    print 'Found ' + str(smh.size()) + ' new articles from SMH'
    print 'Checking for more articles from Slashdot'
    slashdot = newspaper.build(u'http://slashdot.org/')
    print 'Found ' + str(slashdot.size()) + ' new articles from SlashDot'
    print 'Checking for more articles from BBC'
    bbc = newspaper.build(u'http://www.bbc.com/')
    print 'Found ' + str(bbc.size()) + ' new articles from BBC'
    return cnn.articles + smh.articles + slashdot.articles + bbc.articles
Developer: NyxTheDarkness, Project: News-Parser, Lines: 14, Source: script.py


Example 3: get_article_urls

 def get_article_urls(self, rclient, source_url):
     paper = newspaper.build(
         source_url, memoize_articles=False, fetch_images=False,
         request_timeout=self.timeout, number_threads=self.threads,
         language=self.language, browser_user_agent=self.user_agent)
     urls = ((a.url, a.title) for a in paper.articles[:self.max_articles])
     return ifilterfalse(lambda x: rclient.exists(x[0]), urls)
Developer: martinthenext, Project: criticalsyncing, Lines: 7, Source: fetcher.py


Example 4: populate_sites

def populate_sites(sites):
    """ (list of str) -> list of [str, newspaper.source.Source]
    Parses through the sites using newspaper library and
    returns list of sites with available articles populated

    Keyword arguments:
    sites         -- List of [name, url] of each site
    """
    new_sites = []
    for s in range(len(sites)):
        # Check for any new command on communication stream
        check_command()
        # Duplicate the name of the sites
        new_sites.append([sites[s][0]])
        # Use the url and populate the site with articles
        new_sites[s].append(
            (
                newspaper.build(
                    sites[s][1],
                    memoize_articles=False,
                    keep_article_html=True,
                    fetch_images=False,
                    language="en",
                    number_threads=1,
                )
            )
        )
        # Append site url
        new_sites[s].append(sites[s][1])
    return new_sites
Developer: KanwarGill, Project: team04-Project, Lines: 30, Source: article_explorer.py


Example 5: readArticleCollectionFile

def readArticleCollectionFile(site, filename, c):
	f = open(filename, 'w')

	paper = newspaper.build(site, memoize_articles=False)

	print len(paper.articles)

	i = 0
	for article in paper.articles:
		article.download()
		article.parse()

		title = article.title.encode('ascii', 'ignore')
		text = article.text.encode('ascii', 'ignore')

		#article.nlp()
		#keywords = article.keywords
		#summary = article.summary.encode('ascii', 'ignore')

		f.write('<article>\n')
		f.write("<class>" + str(c) + "</class>\n")
		f.write('<title>' + title + '</title>\n')
		f.write('<text>\n' + text + '</text>\n')
		#f.write('<keywords>' + str(keywords) + '</keywords>\n')
		#f.write('<summary>' + summary + '</summary>\n')
		f.write("</article>\n")
		i = i + 1
		if i > 40:
			break
	f.close()
Developer: mataevs, Project: news-analysis, Lines: 30, Source: article_parse.py


Example 6: fetch_article_url

    def fetch_article_url(self, memoize=False):
        paper = newspaper.build(self.url, memoize_articles=memoize) or []
        self.narticles = paper.size()
        print 'article count:%s' % self.narticles
        pipe = redis.pipeline()
        date_fmt = r'\d{4}[-/]\d{2}[-/]\d{2}'
        for article in paper.articles:
            url = article.url
            print url
            date_keys = re.findall(date_fmt, url)
            if not date_keys:
                continue

            date_key = date_keys[0]
            key = self.key(date_key)

            pipe.sadd(key, url)

            if self.save and date_key in self.get_valid_days():
                print 'processing....'
                try:
                    article.download()
                    article.parse()
                    key = self.key(date_key, article.title)
                    pipe.set(key, article.text)
                except:
                    pass
               
        pipe.execute()
Developer: brenden17, Project: Realtime-Trend-with-Flask-on-Redis, Lines: 29, Source: resource_for_word2vec.py


Example 7: _get_articles

 def _get_articles(url):
     url = url.strip()
     for file in os.listdir(newspaper.settings.ANCHOR_DIRECTORY):  # clearing newspaper categories cache
         os.unlink(os.path.join(newspaper.settings.ANCHOR_DIRECTORY, file))
     articles = newspaper.build(url).articles
     if url.split('.')[1] == 'jetbrains':  # at least for now. Newspaper is a bit buggy on JetBrains site
         articles = []
         for page in range(10):
             soup = BeautifulSoup(requests.get(url + '/page/' + str(page)).content, 'html.parser')
             for title in soup.find_all('h2', {'class': 'entry-title'}):
                 articles.append(NewspaperArticle(title.find('a').get('href')))
     for article in articles:
         article.download()
         if not article.is_downloaded:
             print("Failed to download article:", article.url)
             continue
         article.parse()
         article.nlp()
         publish_date = article.publish_date
         if publish_date is None and url.split('.')[1] == 'jetbrains':
             soup = BeautifulSoup(requests.get(article.url).content, 'html.parser')
             publish_date = soup.find('span', {'class': 'entry-date'}).getText()
             # actually, newspaper is very buggy on JetBrains blog and often cannot parse publish date
         print(publish_date)
         yield DataMiningArticle(article.html, article.title, article.summary, article.text,
                                 "", article.canonical_link, "", publish_date)
Developer: tehnar, Project: Event-extraction, Lines: 26, Source: blogs_parser.py


Example 8: validate_site

def validate_site(site):
    try:
        s = newspaper.build(site, memoize_articles=False, keep_article_html=True, fetch_images=False, language="en")
        if s.size() == 0:
            raise ValidationError("%s is not a valid Referring Site!" % site)
    except:
        raise ValidationError("%s is not a valid Referring Site!" % site)
Developer: wangx173, Project: Voyage, Lines: 7, Source: models.py


Example 9: Calculate

def Calculate():
	try:
		news = request.form['inputNews'].lower()
		topic = request.form['inputTopic']
		category = request.form['inputCategory']

		print news + "\t" + topic + "\t" + category
		
		from havenondemand.hodindex import HODClient
		client = HODClient(apikey='6b1f8438-56c7-45e0-98a6-6742c1be0d65', apiversiondefault=1)

		"""def get_bias(url):
			print "Hello"
			data = {'url': url}
			r = client.post('analyzesentiment', data)
			sentiment = r.json()['aggregate']['sentiment']
			score = r.json()['aggregate']['score']
			print url + " | " + sentiment + " | " + str(score)
			return score"""

		paper = newspaper.build("http://" + news + ".com", language='en', memoize_articles=False)

		url = []

		for article in paper.articles:
			url.append(article.url)

		cumulative_score = 0.0
		countNegative = 0
		countPositive = 0
		countNeutral = 0

		"""import multiprocessing as mp

		p = mp.Pool(3)
		res = p.map(get_bias, url)"""

		print newspaper.category

		for u in url:
			data = {'url': u}
			r = client.post('analyzesentiment', data)
			sentiment = r.json()['aggregate']['sentiment']
			score = r.json()['aggregate']['score']
			print u + " | " + sentiment + " | " + str(score)
			cumulative_score += score
			if sentiment == 'positive':
				countPositive += 1
			elif sentiment == 'negative':
				countNegative += 1
			elif sentiment == 'neutral':
				countNeutral += 1				

		print cumulative_score
		print cumulative_score/len(url)

	except Exception as e:
		return json.dumps({'error':str(e)})

	return news + topic + category
Developer: ShwetanshuSingh, Project: bias-measure, Lines: 60, Source: trial.py


Example 10: discover_feeds_urls

def discover_feeds_urls(feed_url):
    """ Try to discover more feed URLs in one. """

    LOGGER.info(u'Trying to discover new RSS/Atom feeds from %s…', feed_url)

    try:
        site = newspaper.build(feed_url)

        urls_to_try = set(site.feed_urls())

    except:
        LOGGER.exception(u'Newspaper did not help finding feeds '
                         u'from “%s”', feed_url)
        return  # without a site object there are no URLs to try

    created = []
    known = []

    for url in urls_to_try:
        result = create_feeds_from_url(url, recurse=False)

        if result:
            # keep feeds if they have been created
            created.extend(x[0] for x in result if x[1])
            known.extend(x[0] for x in result if not x[1])

    LOGGER.info(u'Done discovering %s: %s feeds created, %s already known.',
                feed_url, len(created), len(known))
Developer: 1flow, Project: 1flow, Lines: 27, Source: rssatom.py


Example 11: get_news_data

def get_news_data():
    # Get list of settings
    urllist: SettingsList = get_safe_settingslist('CryptoNewsUrls', urls)
    keylist: SettingsList = get_safe_settingslist('CrytoNewsKeywords', keywords)

    logger_name = 'main_scraper.' + "bitcoin_news"
    logger = logging.getLogger(logger_name)

    for url in urllist.list:
        paper = newspaper.build(url, language='en')
        for article in paper.articles:
            try:
                article.download()
                article.parse()

                keys = [key for key in keylist.list if key in article.title.lower()]
                if len(keys) > 0:
                    # check if article already exists
                    obj = CryptoNews.objects(title=article.title).first()
                    if obj is None:
                        news = CryptoNews()
                        news.title = article.title
                        news.description = article.meta_description
                        news.text = article.text
                        news.tags = keys
                        news.url = article.url
                        news.save()
                        logger.info(article.title)

            except BaseException as e:
                logger.error('Cryptonews error{0}'.format(e))
                pass
Developer: dangraf, Project: PycharmProjects, Lines: 32, Source: cryptonews_scraper.py


Example 12: build_newspaper

 def build_newspaper(self):
     '''
     This method builds newspaper using their url and newspaper library
     '''
     for site_url in self.site_urls:
         self.built_newspapers.append(newspaper.build(site_url,
                                                    memoize_articles=False))
Developer: AndriyZ, Project: web_scrapper, Lines: 7, Source: scrapper_container.py


Example 13: makeDocs

def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)

    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Developer: shawncaeiro, Project: persoNews, Lines: 33, Source: createIndex.py


Example 14: getArticlesFromSource

	def getArticlesFromSource(self, source, external=False):
		paper = newspaper.build(source, memoize_articles=True, browser_user_agent='BEEVA/Emojinews crawler 1.0')
		#Filtering articles out of domain
		news = filter((lambda x: x.url.startswith(source) or external), paper.articles)
		news = map(self.cleanNames, news)
		news = filter((lambda x: x.title), news)
		news = map(lambda x: {'url':x.url, 'title':x.title}, news)
		return news
Developer: beeva-labs, Project: emojinews, Lines: 8, Source: feed.py


Example 15: main

def main():
    source_url, rabbit_url = parse_config()
    paper = newspaper.build(source_url)
    publisher = Publisher(
        rabbit_url=rabbit_url,
        publish_interval=0.25,
        article_urls=paper.article_urls())
    publisher.run()
Developer: projectweekend, Project: Article-Collector, Lines: 8, Source: collector.py


Example 16: get_newspapers

def get_newspapers(source_urls):
    papers = []
    for url in source_urls:
        papers.append(newspaper.build(url))
    
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    return papers
Developer: projectweekend, Project: Pi-Holly, Lines: 8, Source: newspaper_helpers.py


Example 17: get_article_text

def get_article_text(link):

    text = []
    newspapr = newspaper.build(link)
    for article in newspapr.articles:
      # print(article.url)
      text.append(article.text)

    return text
Developer: hamza765, Project: Political-Bias-Predictor, Lines: 9, Source: web_crawler.py


Example 18: RefreshArticles

def RefreshArticles(domain, directory, personality, log=Print, timeout=None):
    start_time = time.time()
    arts = np.build(domain, language='en', memoize_articles=False).articles
    log(domain + " has %d articles" % len(arts))
    for art in arts:
        if timeout is not None and time.time() - start_time > timeout:
            log("Timeout after %f seconds" % (time.time() - start_time))
            return
        DownloadAndProcess(art.url, directory, personality, log=log)
Developer: jvictor0, Project: TiaraBoom, Lines: 9, Source: article_rat.py


Example 19: extract_articles

def extract_articles(news_source='http://cnn.com',num=10):
    news_source = newspaper.build(news_source, memoize_articles=False)
    output=[]
    for art in news_source.articles[:min(num,len(news_source.articles))]:
        art.download()
        art.parse()
        raw=art.text
        output=output+nltk.word_tokenize(raw)
    return output
Developer: ggomarr, Project: NLTK, Lines: 9, Source: scrapbook_ch_04.py


Example 20: test_download_works

    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print ('slate has %d articles tc has %d articles espn has %d articles'
               % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print 'Downloaded slate mthread len', len(slate_paper.articles[0].html)
        print 'Downloaded espn mthread len', len(espn_paper.articles[-1].html)
        print 'Downloaded tc mthread len', len(tc_paper.articles[1].html)
Developer: Geekking, Project: newspaper, Lines: 18, Source: unit_tests.py



Note: The newspaper.build examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Consult the license of the corresponding project before distributing or using the code; do not reproduce this article without permission.

