• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Python tidy.parseString函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中tidy.parseString函数的典型用法代码示例。如果您正苦于以下问题：Python parseString函数的具体用法？Python parseString怎么用？Python parseString使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了parseString函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_bad_option_values

 def test_bad_option_values(self):
     """Check that malformed option values are rejected by ``tidy.parseString``.

     Each option dict is passed as keyword arguments together with
     ``self.input2``; every call must raise ``tidy.OptionArgError`` with a
     message matching "missing or malformed argument".
     """
     badopts = [{"indent": "---"}, {"indent_spaces": None}]
     for opts in badopts:
         # assertRaisesRegexp was a deprecated alias and is gone in
         # Python 3.12; assertRaisesRegex is the supported spelling.
         with self.assertRaisesRegex(
             tidy.OptionArgError, "missing or malformed argument"
         ):
             tidy.parseString(self.input2, **opts)
开发者ID:nijel,项目名称:utidylib,代码行数:7,代码来源:test_tidy.py


示例2: test_bad_options

 def test_bad_options(self):
     """Check that unknown option names are rejected by ``tidy.parseString``.

     Each option dict is passed as keyword arguments together with
     ``self.input2``; every call must raise ``tidy.InvalidOptionError`` with
     a message matching "not a valid Tidy option".
     """
     badopts = [{"foo": 1}]
     for opts in badopts:
         # assertRaisesRegexp was a deprecated alias and is gone in
         # Python 3.12; assertRaisesRegex is the supported spelling.
         with self.assertRaisesRegex(
             tidy.InvalidOptionError, "not a valid Tidy option"
         ):
             tidy.parseString(self.input2, **opts)
开发者ID:nijel,项目名称:utidylib,代码行数:7,代码来源:test_tidy.py


示例3: test_encodings

 def test_encodings(self):
     """Verify that ``output_encoding`` controls the bytes tidy emits.

     ``foo.htm`` is decoded as UTF-8 and re-encoded to ASCII with XML
     character references.  The result is tidied once with latin1 output and
     once with utf8 output, and the rendered document must contain the byte
     0xE9 and the byte pair 0xC3 0xA9 respectively.
     """
     # Close the file deterministically instead of leaking the handle;
     # open() also replaces the Python-2-only file() builtin.
     with open('foo.htm', 'rb') as handle:
         raw = handle.read()
     foo = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')

     doc1u = tidy.parseString(foo, input_encoding='ascii',
                              output_encoding='latin1')
     # assertTrue replaces the deprecated failUnless alias.
     self.assertTrue(str(doc1u).find('\xe9') >= 0)

     doc2u = tidy.parseString(foo, input_encoding='ascii',
                              output_encoding='utf8')
     self.assertTrue(str(doc2u).find('\xc3\xa9') >= 0)
开发者ID:corydodt,项目名称:uTidylib,代码行数:9,代码来源:test_tidy.py


示例4: test_badOptions

 def test_badOptions(self):
     """Every invalid option dict must make ``tidy.parseString`` raise.

     An unknown option name, a malformed value and a ``None`` value are each
     passed with ``self.input2``; the test fails for the first one that does
     not raise ``tidy.TidyLibError``.
     """
     for bad in ({'foo': 1}, {'indent': '---'}, {'indent_spaces': None}):
         try:
             tidy.parseString(self.input2, **bad)
         except tidy.TidyLibError:
             # Expected outcome: move on to the next bad option set.
             continue
         self.fail("Invalid option %s should have raised an error" %
                   repr(bad))
开发者ID:corydodt,项目名称:uTidylib,代码行数:10,代码来源:test_tidy.py


示例5: test_encodings

 def test_encodings(self):
     """Verify that ``output_encoding`` controls the bytes tidy emits.

     The contents of ``self.test_file`` are decoded as UTF-8 and re-encoded
     to ASCII with XML character references.  Tidying with latin1 output
     must yield the byte 0xE9, and with utf8 output the pair 0xC3 0xA9, in
     the value returned by ``getvalue()``.
     """
     # Use a context manager so the file handle is closed promptly instead
     # of being left for the garbage collector (ResourceWarning).
     with open(self.test_file, "rb") as handle:
         raw = handle.read()
     text = raw.decode("utf8").encode("ascii", "xmlcharrefreplace")

     doc1u = tidy.parseString(text, input_encoding="ascii", output_encoding="latin1")
     self.assertIn(b"\xe9", doc1u.getvalue())

     doc2u = tidy.parseString(text, input_encoding="ascii", output_encoding="utf8")
     self.assertIn(b"\xc3\xa9", doc2u.getvalue())
开发者ID:nijel,项目名称:utidylib,代码行数:11,代码来源:test_tidy.py


示例6: test_options

 def test_options(self):
     """Exercise a handful of tidy options on strings and on a file.

     Checks that XHTML output of ``self.input1`` contains "CDATA", that a
     lone ``<Html>`` tag yields an XML declaration, reported errors and no
     "\\n" characters with ``newline="CR"``, and that parsing
     ``self.test_file`` applies ``alt_text`` and keeps the "é" character.
     """
     xhtml_opts = dict(add_xml_decl=1, show_errors=1, newline="CR", output_xhtml=1)

     cdata_doc = tidy.parseString(self.input1, **xhtml_opts)
     self.assertIn("CDATA", str(cdata_doc))

     broken_doc = tidy.parseString("<Html>", **xhtml_opts)
     rendered = str(broken_doc)
     self.assertTrue(rendered.startswith("<?xml"))
     self.assertNotEqual(len(broken_doc.errors), 0)
     self.assertNotIn("\n", rendered)

     file_doc = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
     file_text = file_doc.gettext()
     self.assertIn('alt="foo"', file_text)
     self.assertIn("é", file_text)
开发者ID:nijel,项目名称:utidylib,代码行数:14,代码来源:test_tidy.py


示例7: load_doc_file

def load_doc_file(filename, f):
	"""Tidy one documentation page and insert it into the ``docs`` table.

	The content of the file object ``f`` is decoded as latin1, its title is
	taken from group 1 of ``re_titlematch`` (empty string when there is no
	match), the page is cleaned with tidy and the result is inserted through
	the module-level cursor ``curs``.  The module-level ``pagecount`` is
	incremented by one.  A progress line is printed unless ``quiet`` is set.
	"""
	global pagecount

	contents = unicode(f.read(), 'latin1')

	match = re_titlematch.search(contents)
	title = match.group(1) if match else ""
	if not quiet:
		print("--- file: %s (%s) ---" % (filename, title))

	tidied = tidy.parseString(
		contents.encode('utf-8'),
		drop_proprietary_attributes=1,
		alt_text='',
		hide_comments=1,
		output_xhtml=1,
		show_body_only=1,
		clean=1,
		char_encoding='utf8',
		indent='auto',
	)

	row = {
		'f': filename,
		'v': ver,
		't': title,
		'c': str(tidied),
	}
	curs.execute("INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)", row)
	pagecount += 1
开发者ID:ChristophBerg,项目名称:pgweb,代码行数:28,代码来源:docload.py


示例8: get_page_title

def get_page_title(content):
  """Return the text of the page's <title>, or "" when none can be found.

  The markup is converted to XHTML with tidy, passed through the ENTITY
  substitution and parsed with etree; the title is read from the XHTML
  head/title elements.  On any exception the error is printed and the
  R_TITLE regular expression is tried against the current value of
  ``content`` instead.
  """
  try:
    # ``content`` is rebound step by step on purpose: the fallback below
    # searches whatever stage was reached before the failure.
    content = str(tidy.parseString(content, output_xhtml=True, add_xml_decl=True, indent=False, tidy_mark=False))
    content = ENTITY.sub(ENTITY_REP, content)

    document = etree.fromstring(content)
    head_node = document.find("{http://www.w3.org/1999/xhtml}head")
    title_node = head_node.find("{http://www.w3.org/1999/xhtml}title")
    page_title = title_node.text

    # Pause half a second after a successful parse before returning.
    time.sleep(0.5)
    return page_title
  except Exception as err:
    print(" ".join(("\tHTML Parser Error:", str(err))))

    fallback = R_TITLE.search(content)
    return fallback.group(1) if fallback is not None else ""
开发者ID:iand,项目名称:talisians,代码行数:27,代码来源:scanner.py


示例9: issue

def issue(answers_xml):
    """Validate the answers document and run it through the XSLT stylesheet.

    ``answers_xml`` is handed to ``validateAnswers``, whose return value is
    used as the input document for the transform built from ``XSLT_SOURCE``.

    Returns the serialised transform result as produced by
    ``transform.tostring``.

    NOTE: the output is deliberately *not* passed through Tidy.  A Tidy
    post-processing step used to follow the return statement but could
    never execute, so it has been removed; behaviour is unchanged.
    """
    # validateAnswers both validates and yields the parsed answers document
    ctxt = validateAnswers(answers_xml)

    # apply the xslt transform
    transform = lxml.etree.XSLT(lxml.etree.parse(XSLT_SOURCE))
    result = transform.apply(ctxt)

    return transform.tostring(result)
开发者ID:cc-archive,项目名称:api,代码行数:25,代码来源:support.py


示例10: tidyhtml

def tidyhtml(html):
    """Tidy up HTML code and return it as an XHTML string.

    Unicode input is encoded to UTF-8 first; tidy is told to read and write
    UTF-8 and to omit its generator mark.
    """
    source = html.encode("utf-8") if isinstance(html, unicode) else html
    document = tidy.parseString(
        source,
        output_xhtml=1,
        tidy_mark=0,
        input_encoding="utf8",
        output_encoding="utf8",
    )
    return str(document)
开发者ID:cc-archive,项目名称:jtoolkit,代码行数:7,代码来源:tidywidget.py


示例11: clean

def clean(txt):
    """Run ``txt`` through tidy and return the result as a unicode string.

    Tidy is asked for strict-doctype XHTML without XML declaration,
    indentation, generator mark or line wrapping; the rendered document is
    decoded as UTF-8.
    """
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'doctype': "strict",
        'wrap': 0,
    }
    tidied = tidy.parseString(txt, **tidy_options)
    return unicode(str(tidied), 'utf8')
开发者ID:dnet,项目名称:f33dme,代码行数:7,代码来源:fetch.py


示例12: tidy_html

def tidy_html(html_buffer, cleaning_lib='utidylib'):
    """
    Tidy up the input HTML using one of the installed cleaning
    libraries.

    @param html_buffer: the input HTML to clean up
    @type html_buffer: string
    @param cleaning_lib: chose the preferred library to clean the HTML. One of:
                         - utidylib
                         - beautifulsoup
    @return: a cleaned version of the input HTML
    @note: requires uTidylib or BeautifulSoup to be installed. If the chosen
        library is missing, unknown, or fails while cleaning, the input
        X{html_buffer} is returned I{as is}.
    """

    # The library name is compared first so that an unrecognised name falls
    # straight through to the "return as is" branch.
    if cleaning_lib == 'utidylib' and CFG_TIDY_INSTALLED:
        options = dict(output_xhtml=1,
                       show_body_only=1,
                       merge_divs=0,
                       wrap=0)
        try:
            output = str(tidy.parseString(html_buffer, **options))
        except Exception:
            # Best effort: keep the original markup when tidy fails, but do
            # not swallow KeyboardInterrupt/SystemExit as a bare except would.
            output = html_buffer
    elif cleaning_lib == 'beautifulsoup' and CFG_BEAUTIFULSOUP_INSTALLED:
        try:
            output = str(BeautifulSoup(html_buffer).prettify())
        except Exception:
            # Same best-effort fallback as for uTidylib.
            output = html_buffer
    else:
        output = html_buffer

    return output
开发者ID:AlbertoPeon,项目名称:invenio,代码行数:32,代码来源:htmlutils.py


示例13: _tidy2

        def _tidy2(text):
            """Clean ``text`` with uTidyLib and extract its 'body' part.

            The text is converted to XHTML without XML declaration,
            indentation or generator mark; the rendered document is then
            passed to ``_in_tag`` with the tag name 'body'.
            """
            tidy_options = dict(output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0)
            document = tidy.parseString(text, **tidy_options)
            return _in_tag(str(document), 'body')
开发者ID:nnevvinn,项目名称:crockersrules,代码行数:7,代码来源:textile.py


示例14: to_xhtml

 def to_xhtml(self, stylesheet_url='', settings=DEFAULT_HTML_OVERRIDES,
 tidy_settings=DEFAULT_TIDY_XHTML_OPTIONS, *args, **kwargs):
     """Render to HTML via ``to_html`` and tidy the result into XHTML.

     Any caller-supplied ``tidy_output`` keyword is discarded, because
     ``to_html`` is always invoked with ``tidy_output=False`` and tidying is
     done here with ``tidy_settings``.

     Returns a 2-tuple: the tidied document as a string and an empty list.

     NOTE(review): ``settings`` is accepted but not forwarded to ``to_html``
     in this method -- confirm whether that is intended.
     """
     kwargs.pop('tidy_output', None)
     html_string, _unused = self.to_html(
         stylesheet_url, tidy_output=False, *args, **kwargs)
     tidied = tidy.parseString(html_string, **tidy_settings)
     return str(tidied), []
开发者ID:pombredanne,项目名称:rst2a,代码行数:7,代码来源:rst2a.py


示例15: run

 def run(self, text):
     """Pass ``text`` through Tidy and return the result as unicode.

     Tidy does not accept unicode, so the text is encoded before the call
     and the result decoded afterwards.  The codec is the 'char_encoding'
     entry of ``self.markdown.tidy_options`` (default 'utf8').
     """
     options = self.markdown.tidy_options
     enc = options.get('char_encoding', 'utf8')
     tidied = tidy.parseString(text.encode(enc), **options)
     return unicode(tidied, encoding=enc)
开发者ID:2770862886,项目名称:Quicksilver,代码行数:7,代码来源:html_tidy.py


示例16: __init__

    def __init__(self, docid=None, *args,**kwargs):
        """Create an Etherpad document, importing the pad when it is not stored yet.

        When ``docid`` matches ``PADRE`` and no entry with that docid is found
        through ``Docs.find_one``, the pad's HTML export is downloaded,
        cleaned with tidy and handed to the parent constructor as ``raw``.
        In every other case the parent constructor is called with the
        (possibly normalised) ``docid`` only.

        NOTE(review): PADRE groups 1/2/3 look like URL scheme, host and pad
        name -- confirm against the pattern definition.
        """
        # Written through __dict__ directly rather than by normal attribute
        # assignment (presumably to bypass a custom __setattr__ -- verify).
        self.__dict__['type'] = 'etherpad'
        if docid:
            hostValidator = PADRE.search(docid)
            if hostValidator:
                # Normalise docid to "<group2>/<group3>" when both parts matched.
                if hostValidator.group(2) and hostValidator.group(3):
                    docid=("%s/%s" % (hostValidator.group(2), hostValidator.group(3))).encode('utf8')
                    kwargs['docid']=docid
                # Export URL; falls back to 'http://' when group 1 is empty.
                url="%s%s/ep/pad/export/%s/latest?format=html" % (hostValidator.group(1) or 'http://', hostValidator.group(2), hostValidator.group(3))
                # Only fetch the pad when no document with this docid exists.
                if not Docs.find_one({"docid": docid}):
                    context = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(context)
                    # Title: all text nodes of <title>, unescaped, stripped, UTF-8 encoded.
                    self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')

                    # Rebuild a minimal HTML document around the pad's body.
                    doc='<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
                    # Clean it into strict XHTML without declaration, indent, mark or wrapping.
                    raw=str(tidy.parseString(doc, **{'output_xhtml' : 1,
                                                             'add_xml_decl' : 0,
                                                             'indent' : 0,
                                                             'tidy_mark' : 0,
                                                             'doctype' : "strict",
                                                             'wrap' : 0}))
                    kwargs['raw'] = raw
                    kwargs['docid']=docid
                    super(Etherpad,self).__init__(*args, **kwargs)
                    if not 'stems' in self.__dict__ or not self.stems:
                        # let's calculate and cache the results
                        models.tfidf.add_input_document(self.termcnt.keys())
                        self.save()
                    # Freshly imported pad is fully initialised; skip the generic path below.
                    return
            kwargs['docid']=docid
        # Generic path: no docid, no pattern match, or the document already exists.
        super(Etherpad,self).__init__(*args, **kwargs)
开发者ID:asciimoo,项目名称:le-n-x,代码行数:31,代码来源:etherpad.py


示例17: mergestore

 def mergestore(self, inputstore, templatetext, includefuzzy):
     """Merge the translations of ``inputstore`` into the HTML ``templatetext``.

     Newlines in the template are replaced by spaces.  For every non-header
     unit, the first occurrence of the unit's source is replaced by its
     wrapped target; fuzzy units use the wrapped source instead unless
     ``includefuzzy`` is true.  The result is UTF-8 encoded and, when
     ``self.tidy`` is set, passed through tidy.
     """
     merged = templatetext.replace("\n", " ")
     if isinstance(merged, str):
         #TODO: get the correct encoding
         merged = merged.decode('utf-8')
     # TODO: use the algorithm from html2po to get blocks and translate them individually
     # rather than using replace
     for unit in inputstore.units:
         if unit.isheader():
             continue
         original = unit.source
         if includefuzzy or not unit.isfuzzy():
             replacement = self.wrapmessage(unit.target)
         else:
             replacement = self.wrapmessage(unit.source)
         if not replacement.strip():
             # Nothing to substitute for this unit.
             continue
         # TODO: "msgid" is already html-encoded ("&" -> "&amp;"), while
         #   "msgstr" is not encoded -> thus the replace fails
         #   see test_po2html.py in line 67
         merged = merged.replace(original, replacement, 1)
     encoded = merged.encode('utf-8')
     if self.tidy:
         return str(tidy.parseString(encoded))
     return encoded
开发者ID:AlexArgus,项目名称:affiliates-lib,代码行数:26,代码来源:po2html.py


示例18: fix_html

def fix_html(htmlstr):
    """Return ``htmlstr`` tidied into XHTML as a string.

    Tidy is run with XHTML output, without its generator mark and with
    numeric entities enabled.
    """
    tidied = tidy.parseString(
        htmlstr,
        output_xhtml=1,
        tidy_mark=0,
        numeric_entities=1,
    )
    return str(tidied)
开发者ID:undersea,项目名称:curriculum-information-representation,代码行数:7,代码来源:scheduleparser.py


示例19: stripFluff

def stripFluff(html):
    """Return a string of plain html content taken from the body tag.

    The auto-generated page is cleaned with tidy first.  Then the first
    <body>...</body> span is extracted, the body tags themselves are
    removed, list items are processed by ``cleanLi``, style/target/class
    attributes are removed, '<br>' is rewritten as '<br />' and the result
    is passed through ``lowerTags``.
    """
    tidy_options = {
        'output_xhtml': 1,
        'indent': 0,
        'tidy_mark': 0,
        'clean': 1,
        'drop_empty_paras': 1,
        'drop_font_tags': 1,
        'drop_proprietary_attributes': 1,
        'enclose_block_text': 1,
        'literal_attributes': 1,
        'logical_emphasis': 1,
        'merge_divs': 0,
        'error_file': 'tidyerror.log',
        'gnu_emacs': 1,
        'bare': 1,
    }
    tidied = str(tidy.parseString(html, **tidy_options))

    # Keep only the first <body>...</body> span, then drop the tags themselves.
    body = re.findall(r'<body.*?</body>', tidied, re.DOTALL|re.I)[0]
    body = removePattern(body, r'<body.*?>')
    body = body.replace('</body>', '')
    body = cleanLi(body)

    # Strip presentational attributes, in the same order as before.
    for attr_pattern in (r'style=".*?"', r'target=".*?"', r'class=".*?"'):
        body = removePattern(body, attr_pattern)

    body = body.replace('<br>', '<br />')
    return lowerTags(body)
开发者ID:RadicalZephyr,项目名称:Python-Stuff,代码行数:32,代码来源:hdi2.py


示例20: cleanupText

 def cleanupText(self):
     '''Return the report body as cleaned-up XHTML markup.

     The body text of ``self.context`` is stripped of its security proxy,
     unescaped, run through tidy and parsed into a DOM.  The serialised
     children of the first <body> element are concatenated and returned.

     NOTE(review): ``parseString`` is presumably xml.dom.minidom's (the
     result exposes getElementsByTagName/toxml/unlink) -- confirm against
     the file's imports.
     '''
     #This should really be at the top of this file.
     #Leaving it here for the time being so that having
     #libtidy is not a requirement to run bungeni
     import tidy
     body_text = removeSecurityProxy(self.context.body_text)
     #utidylib options
     options = dict(output_xhtml=1,
                 add_xml_decl=1,
                 indent=1,
                 tidy_mark=0,
                 char_encoding='utf8',
                 quote_nbsp=0)
     #remove html entities from the text
     ubody_text = unescape(body_text)
     #clean up xhtml using tidy
     aftertidy = tidy.parseString(ubody_text.encode('utf8'), **options)
     #tidy returns a <tidy.lib._Document object>
     dom = parseString(str(aftertidy))
     try:
         nodeList = dom.getElementsByTagName("body")
         # join avoids repeated string concatenation in a loop
         return "".join(
             childNode.toxml() for childNode in nodeList[0].childNodes
         )
     finally:
         # Release the DOM even when serialisation fails.
         dom.unlink()
开发者ID:BenoitTalbot,项目名称:bungeni-portal,代码行数:26,代码来源:reports.py



注:本文中的tidy.parseString函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python tidylib.tidy_document函数代码示例发布时间:2022-05-27
下一篇:
Python utils.get_store函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap