Python html.remove_tags Function Code Examples


This article collects typical usage examples of the w3lib.html.remove_tags function in Python. If you are wondering how exactly remove_tags works, how to call it, or what real-world usage looks like, the hand-picked code samples below may help.



Twenty code examples of the remove_tags function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
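
Before the project excerpts, a minimal self-contained sketch of the calls that recur in the examples below may be useful. The HTML fragment, variable names, and expected outputs here are illustrative assumptions rather than code taken from any of the listed projects; the functions themselves (remove_tags, remove_tags_with_content, and the which_ones/keep keyword arguments) come from w3lib.html.

from w3lib.html import remove_tags, remove_tags_with_content

# Illustrative fragment (not taken from any project below).
html = u'<div><script>alert("x")</script><p>Hello <b>world</b></p></div>'

# remove_tags strips markup but keeps the text inside every tag.
print(remove_tags(html, which_ones=('b',)))
# -> <div><script>alert("x")</script><p>Hello world</p></div>

# keep= inverts the selection: remove every tag except the listed ones.
print(remove_tags(html, keep=('p',)))
# -> alert("x")<p>Hello world</p>

# For <script>/<style> you usually want to drop the content as well,
# so combine remove_tags_with_content with a final remove_tags pass.
body = remove_tags_with_content(html, which_ones=('script',))
print(remove_tags(body))
# -> Hello world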

Example 1: parse

	def parse(self, response):
		ts = time.time()
		html_name = 'txt/artist/artist' + str(ts) + '.txt'
		file = codecs.open(html_name, 'w+', 'utf-8')
		
		# file.write(response.url)
		# file.write('\n')

		for body in response.css('div.layoutSingleColumn h3').extract():
			body = body.encode(response.encoding)
			body = remove_tags(body)
			print "Header"
			print(body)
			try:
				file.write(body)
			except AttributeError:
				print(AttributeError)
				sys.exit(0)

		for body in response.css('div.layoutSingleColumn p').extract():
			body = body.encode(response.encoding)
			body = remove_tags(body)
			print "Paragraph"
			print(body)
			try:
				file.write(body)
			except AttributeError:
				print(AttributeError)
				sys.exit(0)

		file.close()
Developer: tomlai19852004, Project: DSPersonal, Lines of code: 31, Source: artist.py


Example 2: test_remove_tags

    def test_remove_tags(self):
        # text with tags
        self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
        self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')

        self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>i will removed')
Developer: Preetwinder, Project: w3lib, Lines of code: 7, Source: test_html.py


Example 3: parse

	def parse(self, response):
		max_position = ''
		koma = ','
		headers = response.headers
		itemselector = Selector(response).xpath('//div[@class="content"]')

		if headers['Content-Type'] == 'application/json;charset=utf-8':
			data = json.loads(response.body)
			itemselector = Selector(text=data['items_html']).xpath('//div[@class="content"]')
			max_position = data['min_position']
			yield Request("https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%22demam%20berdarah%22%20OR%20dbd%20OR%20dhf%20OR%20%22dengue%20fever%22%20OR%20%22dengue%20hemorrhagic%22%20OR%20%22sakit%20db%22%20lang%3Aid%20since%3A"+self.start+"%20until%3A"+self.end+"&src=typd&include_available_features=1&include_entities=1&max_position="+max_position+"&reset_error_state=false", 
					callback=self.parse, 
					method="GET",)
		
		for sel in itemselector:
			self.index += 1
			item = TwitterscrapingItem()
			item['index'] = self.index
			item['userid'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/@data-user-id').extract()))
			item['username'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/span[@class="username js-action-profile-name"]/b/text()').extract()))
			item['fullname'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/strong/text()').extract()))
			text_tweet = ''.join(
				map(unicode.strip, sel.xpath('p[@class="TweetTextSize  js-tweet-text tweet-text"]').extract()))
			item['text_tweet'] = remove_tags(text_tweet).replace('\n',' ').replace('\u',' ')
			item['original_text_tweet'] = text_tweet
			hash_tags = koma.join(
				map(unicode.strip, sel.xpath('p[@class="TweetTextSize  js-tweet-text tweet-text"]'
					'/a[@class="twitter-hashtag pretty-link js-nav"]').extract()))
			item['hash_tags'] = remove_tags(hash_tags)
			item['time_tweet'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/small[@class="time"]/a/@title').extract()))
			item['lang'] = ''.join(
				map(unicode.strip, sel.xpath('p[@class="TweetTextSize  js-tweet-text tweet-text"]/@lang').extract()))
			retweets = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-footer"]'
					'/div[@class="ProfileTweet-actionList js-actions"]'
					'/div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]'
					'/button[@class="ProfileTweet-actionButton  js-actionButton js-actionRetweet"]'
					'/div[@class="IconTextContainer"]').extract()))
			item['retweets'] = remove_tags(retweets).strip()
			favorite = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-footer"]'
					'/div[@class="ProfileTweet-actionList js-actions"]'
					'/div[@class="ProfileTweet-action ProfileTweet-action--favorite js-toggleState"]'
					'/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]'
					'/div[@class="IconTextContainer"]').extract()))
			item['favorite'] = remove_tags(favorite).strip()
			item['place_id'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/@data-place-id').extract()))	
			item['place'] = ''.join(
				map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/span[@class="u-hiddenVisually"]/text()').extract()))	
			item['max_position'] = max_position

			yield item
Developer: limaginaire, Project: twitterscrapy, Lines of code: 57, Source: twitter_spider.py


Example 4: test_returns_unicode

 def test_returns_unicode(self):
     # make sure it always return unicode
     assert isinstance(remove_tags(b'no tags'), six.text_type)
     assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
     assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
     assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
     assert isinstance(remove_tags(u'no tags'), six.text_type)
     assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
     assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
     assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
Developer: Preetwinder, Project: w3lib, Lines of code: 12, Source: test_html.py


Example 5: parse

 def parse(self, response):
     hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract()
     containers = response.selector.xpath('//div[contains(@class, "c-container")]')
     for container in containers:
         href = container.xpath('h3/a/@href').extract()[0]
         title = remove_tags(container.xpath('h3/a').extract()[0])
         c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
         abstract = ""
         if len(c_abstract) > 0:
             abstract = remove_tags(c_abstract[0])
         request = scrapy.Request(href, callback=self.parse_url)
         request.meta['title'] = title
         request.meta['abstract'] = abstract
         yield request
Developer: 296245482, Project: ChatBotCourse, Lines of code: 14, Source: baidu_search.py


Example 6: parse

	def parse(self, response):
		s = Selector(response)
		next_link = s.xpath('//div[@class="w-button-more"]/a/@href').extract()
		if len(next_link):
			yield Request("https://mobile.twitter.com"+next_link[0], callback=self.parse)
		itemselector = Selector(response).xpath('//*[@id="main_content"]/div/div[3]/table')
		#regex = re.compile(r"([\\]+u\d*)", re.MULTILINE)
		for sel in itemselector:
			self.index += 1
			item = TwitterscrapingItem()
			item['index'] = self.index
			item['username'] = ''.join(
				map(unicode.strip, sel.xpath('tr[1]/td[2]/a/div/text()').extract()))
			tweet = remove_tags(''.join(
				map(unicode.strip, sel.xpath('tr[2]/td/div').extract()))
				).replace('&amp','&').replace('  ','').replace('\n      ','').replace('\n    ','').replace('\n','').replace('\u',' ')
			item['text_tweet'] = u''+tweet
			item['original_tweet'] = ''.join(sel.xpath('tr[2]/td/div/div').extract())
			item['time_tweet'] = ''.join(
				map(unicode.strip, sel.xpath('tr[1]/td[3]/a/text()').extract()))
			item['url'] = ''.join(
				map(unicode.strip, sel.xpath('tr[2]/td/div/@data-id').extract()))
			item['data_id'] = ''.join(
				map(unicode.strip, sel.xpath('tr[3]/td/span[1]/a/@href').extract()))
			yield item
Developer: pwcahyo, Project: tweetmobile, Lines of code: 25, Source: twitter_spider.py


Example 7: _extract_features

 def _extract_features(self, sel, item):
     description_xpath = '//div[@id="tab1"]/ul/li'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0 :
         data = [remove_tags(v).strip().replace('&nbsp;',' ').replace('&gt;','>').strip()  for v in data]
         data = filter(None,data)
         item['description'] = '<br>'.join(data)
Developer: revotu, Project: dev-crawler, Lines of code: 7, Source: ericdress.py


Example 8: parse_speech

    def parse_speech(self, response):
        paragraphs = response.css('p')[:-1]  # last p contains pagination
        text = remove_tags(''.join(paragraphs.extract()))

        l = ParlamentHuSpeechLoader(item=Speech(), selector=response,
            scheme='parlament.hu/people')
        l.add_value('text', text)
        l.add_value('type', 'speech')
        l.add_value('sources', [response.url])
        l.add_xpath('position', '//b[1]/text()')
        l.add_xpath('video', '//table//tr[6]//td[2]/a/@href')
        l.add_xpath('creator_id', '//table//tr[2]//td[2]/a/@href',
            re=r'ogy_kpv\.kepv_adat\?p_azon=(\w\d+)')
        l.add_value('event_id', response.meta['event_id'])

        date = response.xpath(
            '//table//tr[1]/th/text()').re(r'\d{4}\.\d{2}.\d{2}\.')
        time = response.meta.get('time')
        if date:
            date = date[0]
            if time:
                date += time[0]
            l.add_value('date', date)
        item = l.load_item()
        yield item
        if 'creator_id' in item:
            yield scrapy.Request(self.get_api_url(
                self.PERSON_ENDPOINT, params={
                    'p_azon': item['creator_id']['identifier']}),
                callback=self.parse_person, meta={
                    'p_azon': item['creator_id']['identifier']})
Developer: TransparenCEE, Project: parldata-scrapers-poland-hungary-montenegro, Lines of code: 31, Source: parlament_hu.py


Example 9: make_it_clean

def make_it_clean(line):
	'''
	Strip HTML tags, CSS styles, and JS from the text
	(string) line - the input text
	'''
	cleari = remove_tags(line)
	soline = re.compile("(\<.+\n)", re.DOTALL)
	boline = re.compile("(.+\>)", re.DOTALL)
	alline = re.compile("\<.+\>", re.DOTALL)
	cleari = re.sub(soline, '', cleari)
	cleari = re.sub(boline, '', cleari)
	cssline = re.compile(r"\{.+\}{1}", re.DOTALL)
	cleari = re.sub(cssline, ' ', cleari)
	cleari = re.sub("async=\"async\"\n", '', cleari)
	cleari = re.sub("src=.+\"", '', cleari)
	cleari = re.sub("var\s_.+\)", '', cleari)
	cleari = re.sub("function.+\"\)", '', cleari)
	cleari = re.sub("document.+\);", " ", cleari)
	cleari = re.sub("function.+\)", " ", cleari)
	cleari = re.sub("&laquo;", " «", cleari)
	cleari = re.sub("&raquo;", "» ", cleari)
	cleari = re.sub("&rarr;", "→", cleari)
	cleari = re.sub(r'&nbsp;', ' ', cleari)
	cleari = re.sub(r'(&mdash;)|(&ndash;)', '-', cleari)
	cleari = re.sub(r'\t{2,}', ' ', cleari)
	cleari = re.sub(r'\s{2,}', ' ', cleari)
	cleari = re.sub(r'\n{2,}', '\n', cleari)
	cleari = re.sub(r"(\<\!\-\-.*\-\-\>)", '', cleari)

	return cleari
Developer: haniani, Project: hse_zadanie3, Lines of code: 30, Source: 3parts.py


Example 10: parse_item

    def parse_item(self, response):

        province = response.css('.dqwz>a:last-child::attr(title)').re_first(ur'2017年(.+?)省?本科')
        school = response.css('.nr>h2::text').extract_first()
        count = len(response.xpath('//div[@id="ivs_content"]/table//tr[1]/td').extract())
        for row in response.xpath('//div[@id="ivs_content"]/table//tr[position()>1]'):
            fields = [remove_tags(i).strip() for i in row.css('td').extract()]
            if count == 4:
                del fields[0]
            if len(fields) == 3:
                rowspan_count = [e.css('::attr(rowspan)').extract_first(1) for e in row.css('td')][-3:]
                rowspan_value = fields
                rowspans = len([i for i in rowspan_count if i > 1])
            elif len(fields) + rowspans == 3:
                new_fields = []
                fields.reverse()
                for k, v in zip(rowspan_count, rowspan_value):
                    if k == 1:
                        new_fields.append(fields.pop())
                    else:
                        new_fields.append(v)
                fields = new_fields
            else:
                continue

            yield ShmecItem(
                province=province,
                school=school,
                major=fields[0],
                require=fields[1],
                remark=fields[2],
            )
Developer: EasyData, Project: gaokao, Lines of code: 32, Source: shmec.py


Example 11: parse_linklist

def parse_linklist(text, remove_tags=False):
    data = []

    for row in text.split('\n'):
        rowparts = row.strip().split(' ')
        if len(rowparts) < 2:
            break
        time = rowparts[0]
        if rowparts[1].startswith('<') and rowparts[1].endswith('>'):
            url = rowparts[1][1:-1]
            textparts = rowparts[2:]
        else:
            url = ''
            textparts = rowparts[1:]
        text = ' '.join(textparts)
        if remove_tags:
            text = html.remove_tags(text)
        data.append(
            {
                'time': time,
                'url': url,
                'text': text
            }
        )
    return data
Developer: erlehmann, Project: redokast, Lines of code: 25, Source: __init__.py


Example 12: _extract_description

 def _extract_description(self, sel, item):
     return
     desc_xpath = '//div[@id="item-overview"]/ul/li/node()'
     data = sel.xpath(desc_xpath).extract()
     if len(data) != 0:
         data = [remove_tags(v.strip()) for v in data]
         description = ';'.join(data).replace(':;',':').replace('from;','from ')
         item['description'] = description
Developer: revotu, Project: dev-crawler, Lines of code: 8, Source: etsy.py


Example 13: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Developer: bihicheng, Project: scrapy, Lines of code: 10, Source: regex.py


Example 14: clean_tags_from_affiliations

def clean_tags_from_affiliations(value):
    """Clean the affiliaton string for an author."""
    for affiliation in value.get('affiliations', []):
        # Remove tag AND content of any prefix like <label><sup>1</sup></label>
        affiliation['value'] = remove_tags_with_content(affiliation['value'], ('label',))
        # Now remove all tags but KEEP content
        affiliation['value'] = remove_tags(affiliation['value'])
        # Remove random whitespaces
        affiliation['value'] = clean_whitespace_characters(affiliation['value'])
    return value
Developer: gitter-badger, Project: hepcrawl, Lines of code: 10, Source: inputs.py


Example 15: process_response

    def process_response(self, request, response, spider):
        # clean body
        orig_body = response.body_as_unicode()
        body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
        body = remove_tags(remove_comments(body))
        terms = tokenize(body.lower())
        request.meta['terms'] = terms
        request.meta['body'] = body

        return response
Developer: JM-YE, Project: recipecrawler, Lines of code: 10, Source: termsextractor.py


Example 16: parse

    def parse(self, response):
        """
        Parse the response page
        """
        # Skip error URLs
        if response.status != 200:
            return

        data = json.loads(response.text)

        title = data['title']
        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        description = data['description']
        data = data['content']

        # Remove <script>, <sup>, <math> tags with the content
        paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)

        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace &#39; with '
        paragraph = paragraph.replace('&#39;', "'")
        paragraph = paragraph.replace('&rsquo;', "'")
        paragraph = paragraph.replace('&ldquo;', "'")
        paragraph = paragraph.replace('&rdquo;', "'")
        # Replace &nbsp; with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')

        # Some more replacements to improve the default tokenization
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\t', '')

        text = title + '\n\n' + description + '\n\n' + paragraph

        # Create the directory
        dirname = 'data/qplum'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the title and the text both
        filename = '{}/{}'.format(dirname, title)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Developer: DrEricEbert, Project: scrapy-finance, Lines of code: 55, Source: qplum.py


Example 17: _extract_links

    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
Developer: 0326, Project: scrapy, Lines of code: 11, Source: regex.py


Example 18: _extract_description

 def _extract_description(self, sel, item):
     description_xpath = '//div[@class="description"]/ul/li/node()'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0:
         data = [remove_tags(v.strip()) for v in data]
         description = ''
         for index,desc in enumerate(data):
             if index % 2 == 0:
                 description += desc
             else :
                 description += desc + ';'
         item['description'] = description
Developer: revotu, Project: dev-crawler, Lines of code: 12, Source: dhgate.py


Example 19: parse_item

    def parse_item(self, response):
        if collect().count() < 10000:
            # print '*******', response.url
            hxs = HtmlXPathSelector(response)
            titles = hxs.select("//div[@id='articleNew']/h1/text()").extract()

            if len(titles) == 0: return

            title = ''.join(titles).strip()

            txts = hxs.select("//div[@id='articleNew']/p").extract()
            conteudo = remove_comments(remove_tags(''.join(txts)))

            i = Artigo()
            i['url'] = response.url
            i['nome'] = title
            i['conteudo'] = conteudo

            # opiniao = {"url": response.url, "nome": title, "conteudo": conteudo}
            opiniao2 = {"conteudo": conteudo}

            # collect().insert(opiniao)  # "Opinioes" collection: every opinion collected from the Painel do Leitor

            ##########################################################################
            # Filter by content and route to different collections
            # Filter by content and save to a file

            arqfile = leitorOpiniao
            frase = conteudo.split()
            if "Dilma" in frase:
                # database()['dilma'].insert(opiniao2)        # "dilma" collection
                arqfile = leitorDilma
            elif "Copa" in frase:
                # database()['copa'].insert(opiniao2)         # "copa" collection
                arqfile = leitorCopa
            elif "Palmeiras" in frase:
                # database()['palmeiras'].insert(opiniao2)    # "palmeiras" collection
                arqfile = leitorPalmeiras
            arq = open(arqfile, 'a')
            arq.writelines(str(opiniao2))
            arq.close()
            # yield i  # print the item to the console

            print '##########################################################'
            # print ("TOTAL DE OPINIOES: %d" % collect().count())
            print ("Salvando em %s " % arqfile)
            print '##########################################################'

        else:
            print 'Fim de scraping leitor'
            exit()
Developer: silviolima, Project: analise-Sentimento-nltk, Lines of code: 52, Source: folha.py


Example 20: _extract_description

 def _extract_description(self, sel, item):
     description_xpath = '//div[@class="itemAttr"]/div[@class="section"]/table//tr/node()'
     data = sel.xpath(description_xpath).extract()
     if len(data) > 0 :
         data = [remove_tags(v).strip().replace('\t','').replace('\n','')  for v in data]
         data = filter(None,data)
         description = ''
         for index,desc in enumerate(data):
             if index % 2 == 0:
                 description += desc
             else :
                 description += desc + ';'
         item['description'] = description
Developer: revotu, Project: dev-crawler, Lines of code: 13, Source: ebay.py



Note: The w3lib.html.remove_tags examples in this article were compiled from source code and documentation platforms such as GitHub and MSDocs, and the snippets were selected from open-source projects contributed by their respective developers. Copyright of the source code belongs to the original authors; please consult each project's license before distributing or reusing it, and do not reproduce this compilation without permission.

