本文整理汇总了Python中tidy.parseString函数的典型用法代码示例。如果您正苦于以下问题:Python parseString函数的具体用法?Python parseString怎么用?Python parseString使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了parseString函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_bad_option_values
def test_bad_option_values(self):
    """Malformed option values must raise ``tidy.OptionArgError``.

    Each dictionary is passed as keyword options to ``tidy.parseString``
    together with ``self.input2``; the raised error message must match
    "missing or malformed argument".
    """
    bad_values = [{"indent": "---"}, {"indent_spaces": None}]
    for opts in bad_values:
        # assertRaisesRegex replaces the assertRaisesRegexp alias, which was
        # deprecated in Python 3.2 and removed in Python 3.12.
        with self.assertRaisesRegex(
            tidy.OptionArgError, "missing or malformed argument"
        ):
            tidy.parseString(self.input2, **opts)
开发者ID:nijel,项目名称:utidylib,代码行数:7,代码来源:test_tidy.py
示例2: test_bad_options
def test_bad_options(self):
    """Unknown option names must raise ``tidy.InvalidOptionError``.

    Each dictionary is passed as keyword options to ``tidy.parseString``
    together with ``self.input2``; the raised error message must match
    "not a valid Tidy option".
    """
    unknown_options = [{"foo": 1}]
    for opts in unknown_options:
        # assertRaisesRegex replaces the assertRaisesRegexp alias, which was
        # deprecated in Python 3.2 and removed in Python 3.12.
        with self.assertRaisesRegex(
            tidy.InvalidOptionError, "not a valid Tidy option"
        ):
            tidy.parseString(self.input2, **opts)
开发者ID:nijel,项目名称:utidylib,代码行数:7,代码来源:test_tidy.py
示例3: test_encodings
def test_encodings(self):
    """Check that ``output_encoding`` selects the byte form of the output.

    The fixture ``foo.htm`` is decoded as UTF-8 and re-encoded as ASCII
    with XML character references, then tidied twice: the latin1 output
    must contain the byte ``\\xe9`` and the utf8 output the byte pair
    ``\\xc3\\xa9``.
    """
    # Read through a context manager so the fixture file is closed
    # deterministically instead of leaking the handle; binary mode keeps
    # the raw bytes for the explicit decode below.
    with open('foo.htm', 'rb') as fixture:
        raw = fixture.read()
    foo = raw.decode('utf8').encode('ascii', 'xmlcharrefreplace')

    doc1u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='latin1')
    # assertTrue replaces the deprecated failUnless alias.
    self.assertTrue(str(doc1u).find('\xe9') >= 0)

    doc2u = tidy.parseString(foo, input_encoding='ascii',
                             output_encoding='utf8')
    self.assertTrue(str(doc2u).find('\xc3\xa9') >= 0)
开发者ID:corydodt,项目名称:uTidylib,代码行数:9,代码来源:test_tidy.py
示例4: test_badOptions
def test_badOptions(self):
    """Each invalid option set must make ``tidy.parseString`` raise
    ``tidy.TidyLibError``; the test fails for any set that is accepted.
    """
    invalid_option_sets = (
        {'foo': 1},
        {'indent': '---'},
        {'indent_spaces': None},
    )
    for dct in invalid_option_sets:
        raised = False
        try:
            tidy.parseString(self.input2, **dct)
        except tidy.TidyLibError:
            raised = True
        if not raised:
            self.fail("Invalid option %s should have raised an error" %
                      repr(dct))
开发者ID:corydodt,项目名称:uTidylib,代码行数:10,代码来源:test_tidy.py
示例5: test_encodings
def test_encodings(self):
    """Check that ``output_encoding`` selects the byte form of the output.

    The file at ``self.test_file`` is decoded as UTF-8 and re-encoded as
    ASCII with XML character references, then tidied twice: the latin1
    output must contain ``b"\\xe9"`` and the utf8 output ``b"\\xc3\\xa9"``.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the chained open(...).read() never closed it explicitly).
    with open(self.test_file, "rb") as handle:
        raw = handle.read()
    text = raw.decode("utf8").encode("ascii", "xmlcharrefreplace")

    doc1u = tidy.parseString(text, input_encoding="ascii", output_encoding="latin1")
    # assertIn reports the searched value on failure, unlike find() >= 0.
    self.assertIn(b"\xe9", doc1u.getvalue())

    doc2u = tidy.parseString(text, input_encoding="ascii", output_encoding="utf8")
    self.assertIn(b"\xc3\xa9", doc2u.getvalue())
开发者ID:nijel,项目名称:utidylib,代码行数:11,代码来源:test_tidy.py
示例6: test_options
def test_options(self):
    """Exercise keyword options of ``tidy.parseString`` and ``tidy.parse``.

    Checks the XHTML output of ``self.input1`` and of a bare ``<Html>``
    tag (XML declaration, reported errors, no "\\n" in the output), then
    parses ``self.test_file`` with ``alt_text`` and ``char_encoding``.
    """
    xhtml_options = {
        "add_xml_decl": 1,
        "show_errors": 1,
        "newline": "CR",
        "output_xhtml": 1,
    }

    doc1 = tidy.parseString(self.input1, **xhtml_options)
    self.assertIn("CDATA", str(doc1))

    doc2 = tidy.parseString("<Html>", **xhtml_options)
    self.assertTrue(str(doc2).startswith("<?xml"))
    self.assertNotEqual(len(doc2.errors), 0)
    self.assertNotIn("\n", str(doc2))

    doc3 = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
    self.assertIn('alt="foo"', doc3.gettext())
    self.assertIn("é", doc3.gettext())
开发者ID:nijel,项目名称:utidylib,代码行数:14,代码来源:test_tidy.py
示例7: load_doc_file
def load_doc_file(filename, f):
    """Tidy one documentation file and insert it into the ``docs`` table.

    The content of *f* is read as latin1, its title is extracted with the
    module-level ``re_titlematch`` pattern (empty string when there is no
    match), and the content is re-encoded as UTF-8 and run through tidy.
    The result is inserted through the module-level cursor ``curs``
    together with *filename* and the module-level ``ver``; the
    module-level ``pagecount`` is incremented afterwards.
    """
    global pagecount

    contents = unicode(f.read(), 'latin1')

    title_match = re_titlematch.search(contents)
    title = title_match.group(1) if title_match else ""

    if not quiet:
        print("--- file: %s (%s) ---" % (filename, title))

    tidyopts = {
        'drop_proprietary_attributes': 1,
        'alt_text': '',
        'hide_comments': 1,
        'output_xhtml': 1,
        'show_body_only': 1,
        'clean': 1,
        'char_encoding': 'utf8',
        'indent': 'auto',
    }
    s = tidy.parseString(contents.encode('utf-8'), **tidyopts)

    row = {
        'f': filename,
        'v': ver,
        't': title,
        'c': str(s),
    }
    curs.execute("INSERT INTO docs (file, version, title, content) VALUES (%(f)s, %(v)s, %(t)s, %(c)s)", row)
    pagecount += 1
开发者ID:ChristophBerg,项目名称:pgweb,代码行数:28,代码来源:docload.py
示例8: get_page_title
def get_page_title(content):
try:
content = str(tidy.parseString(content, output_xhtml=True, add_xml_decl=True, indent=False, tidy_mark=False))
content = ENTITY.sub(ENTITY_REP, content)
#~ f = open("tmp.log", "w")
#~ f.write(content)
#~ f.close()
root = etree.fromstring(content)
head = root.find("{http://www.w3.org/1999/xhtml}head")
title = head.find("{http://www.w3.org/1999/xhtml}title")
titletext = title.text
time.sleep(0.5)
return titletext
except Exception, e:
print "\tHTML Parser Error:", str(e)
m = R_TITLE.search(content)
if m is not None:
return m.group(1)
return ""
开发者ID:iand,项目名称:talisians,代码行数:27,代码来源:scanner.py
示例9: issue
def issue(answers_xml):
    """Apply the XSLT stylesheet to the answers and return the result string.

    The value returned by ``validateAnswers(answers_xml)`` is used as the
    input document of the transform loaded from ``XSLT_SOURCE``.
    """
    # validateAnswers' return value is what gets transformed
    # (the original note suggested lxml.etree.parse(StringIO(answers_xml))
    # as the equivalent plain parse).
    ctxt = validateAnswers(answers_xml)

    # apply the xslt transform
    transform = lxml.etree.XSLT(lxml.etree.parse(XSLT_SOURCE))
    result = transform.apply(ctxt)

    # NOTE(review): a tidy.parseString(..., output_xml=1, input_xml=1,
    # tidy_mark=0, indent=1) post-processing step used to follow this return
    # inside a bare try/except, but it was unreachable. The dead code has been
    # removed; the runtime behaviour (returning the untidied transform output)
    # is unchanged. Confirm whether tidy was disabled on purpose.
    return transform.tostring(result)
开发者ID:cc-archive,项目名称:api,代码行数:25,代码来源:support.py
示例10: tidyhtml
def tidyhtml(html):
    """Run *html* through tidy and return the XHTML output as a ``str``.

    ``unicode`` input is encoded to UTF-8 first; tidy is told that both
    its input and its output are utf8.
    """
    source = html.encode("utf-8") if isinstance(html, unicode) else html
    document = tidy.parseString(source, output_xhtml=1, tidy_mark=0, input_encoding="utf8", output_encoding="utf8")
    return str(document)
开发者ID:cc-archive,项目名称:jtoolkit,代码行数:7,代码来源:tidywidget.py
示例11: clean
def clean(txt):
    """Tidy *txt* into strict XHTML and return it as ``unicode``.

    The tidy output is converted with ``str`` and then decoded as utf8.
    """
    tidied = tidy.parseString(
        txt,
        output_xhtml=1,
        add_xml_decl=0,
        indent=0,
        tidy_mark=0,
        doctype="strict",
        wrap=0,
    )
    return unicode(str(tidied), 'utf8')
开发者ID:dnet,项目名称:f33dme,代码行数:7,代码来源:fetch.py
示例12: tidy_html
def tidy_html(html_buffer, cleaning_lib='utidylib'):
    """
    Tidy up the input HTML using one of the installed cleaning
    libraries.

    @param html_buffer: the input HTML to clean up
    @type html_buffer: string
    @param cleaning_lib: chose the preferred library to clean the HTML. One of:
                         - utidylib
                         - beautifulsoup
    @return: a cleaned version of the input HTML
    @note: requires uTidylib or BeautifulSoup to be installed. If the chosen
           library is missing, or if it raises while cleaning, the input
           X{html_buffer} is returned I{as is}.
    """
    if CFG_TIDY_INSTALLED and cleaning_lib == 'utidylib':
        options = dict(output_xhtml=1,
                       show_body_only=1,
                       merge_divs=0,
                       wrap=0)
        try:
            output = str(tidy.parseString(html_buffer, **options))
        except Exception:
            # Best effort: fall back to the untouched input. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            output = html_buffer
    elif CFG_BEAUTIFULSOUP_INSTALLED and cleaning_lib == 'beautifulsoup':
        try:
            output = str(BeautifulSoup(html_buffer).prettify())
        except Exception:
            # Same best-effort fallback as for uTidylib.
            output = html_buffer
    else:
        output = html_buffer
    return output
开发者ID:AlbertoPeon,项目名称:invenio,代码行数:32,代码来源:htmlutils.py
示例13: _tidy2
def _tidy2(text):
    """Wrap uTidyLib's XHTML clean-up.

    *text* is tidied into XHTML and the string form of the result is
    handed to ``_in_tag`` together with the tag name ``'body'``; that
    call's return value is returned.
    """
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
    }
    document = tidy.parseString(text, **tidy_options)
    return _in_tag(str(document), 'body')
开发者ID:nnevvinn,项目名称:crockersrules,代码行数:7,代码来源:textile.py
示例14: to_xhtml
def to_xhtml(self, stylesheet_url='', settings=DEFAULT_HTML_OVERRIDES,
             tidy_settings=DEFAULT_TIDY_XHTML_OPTIONS, *args, **kwargs):
    """Render to HTML via ``to_html`` and tidy the result.

    Any caller-supplied ``tidy_output`` keyword is discarded and
    ``to_html`` is always called with ``tidy_output=False``; its first
    return value is then passed through tidy with *tidy_settings*.

    Returns a ``(xhtml_string, [])`` tuple.
    """
    # NOTE(review): `settings` is accepted but never used in this method --
    # confirm whether it was meant to be forwarded to to_html.
    kwargs.pop('tidy_output', None)
    html_string, _unused = self.to_html(stylesheet_url, tidy_output=False,
                                        *args, **kwargs)
    xhtml = str(tidy.parseString(html_string, **tidy_settings))
    return xhtml, []
开发者ID:pombredanne,项目名称:rst2a,代码行数:7,代码来源:rst2a.py
示例15: run
def run(self, text):
    """Pass *text* through Tidy and return the result as ``unicode``.

    Tidy does not accept unicode, so the text is encoded before the call
    and the output decoded afterwards, both with the ``char_encoding``
    entry of ``self.markdown.tidy_options`` (default ``'utf8'``).
    """
    options = self.markdown.tidy_options
    enc = options.get('char_encoding', 'utf8')
    tidied = tidy.parseString(text.encode(enc), **options)
    return unicode(tidied, encoding=enc)
开发者ID:2770862886,项目名称:Quicksilver,代码行数:7,代码来源:html_tidy.py
示例16: __init__
def __init__(self, docid=None, *args,**kwargs):
# Initialise an Etherpad document. When a docid is given and matches PADRE,
# the pad's HTML export may be downloaded, tidied into strict XHTML and
# handed to the parent constructor as kwargs['raw'].
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the statements below (in particular which branch the `return` and the final
# super().__init__ call belong to) must be confirmed against the real file.
self.__dict__['type'] = 'etherpad'
if docid:
# PADRE groups as used below: group(1) is the URL prefix (falls back to
# 'http://'), group(2) and group(3) are joined as "<group2>/<group3>" to
# form the normalised docid and are also used to build the export URL.
hostValidator = PADRE.search(docid)
if hostValidator:
if hostValidator.group(2) and hostValidator.group(3):
docid=("%s/%s" % (hostValidator.group(2), hostValidator.group(3))).encode('utf8')
kwargs['docid']=docid
url="%s%s/ep/pad/export/%s/latest?format=html" % (hostValidator.group(1) or 'http://', hostValidator.group(2), hostValidator.group(3))
# The download below appears to be guarded by this lookup, i.e. it runs only
# when Docs.find_one finds no document with this docid -- TODO confirm.
if not Docs.find_one({"docid": docid}):
context = urllib2.urlopen(url).read()
soup = BeautifulSoup(context)
# The title is the unescaped text content of the page's <title>, stripped
# and encoded as utf8.
self.__dict__['title']=unescape(unicode(''.join(soup.title.findAll(text=True)))).strip().encode('utf8')
# Rebuild a minimal HTML document (title + utf-8 content-type meta + the
# unescaped body markup) before handing it to tidy.
doc='<html><head><title>%s</title><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head>%s</html>' % (self.title, unescape(unicode(soup.body)).encode('utf8'))
raw=str(tidy.parseString(doc, **{'output_xhtml' : 1,
'add_xml_decl' : 0,
'indent' : 0,
'tidy_mark' : 0,
'doctype' : "strict",
'wrap' : 0}))
kwargs['raw'] = raw
kwargs['docid']=docid
super(Etherpad,self).__init__(*args, **kwargs)
# When the instance has no 'stems' attribute (or it is empty), register the
# keys of self.termcnt with models.tfidf and save the document.
if not 'stems' in self.__dict__ or not self.stems:
# let's calculate and cache the results
models.tfidf.add_input_document(self.termcnt.keys())
self.save()
return
# Remaining path: pass the (possibly normalised) docid to the parent
# constructor without downloading anything.
kwargs['docid']=docid
super(Etherpad,self).__init__(*args, **kwargs)
开发者ID:asciimoo,项目名称:le-n-x,代码行数:31,代码来源:etherpad.py
示例17: mergestore
def mergestore(self, inputstore, templatetext, includefuzzy):
    """Merge the translations of *inputstore* into the HTML *templatetext*.

    Newlines in the template are replaced by spaces and a byte string is
    decoded as UTF-8. For every non-header unit, the first occurrence of
    the unit's source text is replaced by its wrapped target; fuzzy units
    use their wrapped source instead unless *includefuzzy* is true. Units
    whose wrapped text is blank are skipped. The result is encoded as
    UTF-8 and, when ``self.tidy`` is set, passed through tidy.
    """
    htmlresult = templatetext.replace("\n", " ")
    if isinstance(htmlresult, str):
        #TODO: get the correct encoding
        htmlresult = htmlresult.decode('utf-8')

    # TODO: use the algorithm from html2po to get blocks and translate them
    # individually rather than using replace
    for inputunit in inputstore.units:
        if inputunit.isheader():
            continue
        msgid = inputunit.source
        use_target = includefuzzy or not inputunit.isfuzzy()
        msgstr = self.wrapmessage(
            inputunit.target if use_target else inputunit.source)
        if not msgstr.strip():
            continue
        # TODO: "msgid" is already html-encoded ("&" -> "&amp;"), while
        #   "msgstr" is not encoded -> thus the replace fails
        #   see test_po2html.py in line 67
        htmlresult = htmlresult.replace(msgid, msgstr, 1)

    htmlresult = htmlresult.encode('utf-8')
    if self.tidy:
        htmlresult = str(tidy.parseString(htmlresult))
    return htmlresult
开发者ID:AlexArgus,项目名称:affiliates-lib,代码行数:26,代码来源:po2html.py
示例18: fix_html
def fix_html(htmlstr):
    """Return *htmlstr* tidied into XHTML, as a ``str``.

    Tidy is run with numeric entities enabled and the tidy mark disabled.
    """
    options = {
        'output_xhtml': 1,
        'tidy_mark': 0,
        'numeric_entities': 1,
    }
    document = tidy.parseString(htmlstr, **options)
    return str(document)
开发者ID:undersea,项目名称:curriculum-information-representation,代码行数:7,代码来源:scheduleparser.py
示例19: stripFluff
def stripFluff(html):
    """Return the cleaned-up body content of an auto-generated html page.

    The page is tidied into XHTML, the first ``<body>...</body>`` span is
    extracted and its body tags removed, list items are passed through
    ``cleanLi``, ``style``/``target``/``class`` attributes are removed
    with ``removePattern``, ``<br>`` becomes ``<br />`` and the result is
    passed through ``lowerTags``.
    """
    options = {
        'output_xhtml': 1,
        'indent': 0,
        'tidy_mark': 0,
        'clean': 1,
        'drop_empty_paras': 1,
        'drop_font_tags': 1,
        'drop_proprietary_attributes': 1,
        'enclose_block_text': 1,
        'literal_attributes': 1,
        'logical_emphasis': 1,
        'merge_divs': 0,
        'error_file': 'tidyerror.log',
        'gnu_emacs': 1,
        'bare': 1,
    }
    html = str(tidy.parseString(html, **options))

    # Only the first <body> element is kept; indexing [0] raises IndexError
    # when the tidied output contains no body span.
    body_spans = re.findall(r'<body.*?</body>', html, re.DOTALL|re.I)
    temp = body_spans[0]
    temp = removePattern(temp, r'<body.*?>')
    temp = temp.replace('</body>', '')
    temp = cleanLi(temp)

    for attribute_pattern in (r'style=".*?"', r'target=".*?"', r'class=".*?"'):
        temp = removePattern(temp, attribute_pattern)

    temp = temp.replace('<br>', '<br />')
    return lowerTags(temp)
开发者ID:RadicalZephyr,项目名称:Python-Stuff,代码行数:32,代码来源:hdi2.py
示例20: cleanupText
def cleanupText(self):
    '''Return the report body as tidied XHTML markup.

    The body text of ``self.context`` is stripped of its security proxy,
    unescaped, tidied into XHTML and parsed into a DOM; the serialised
    children of the first ``body`` element are concatenated and returned.
    '''
    # Imported locally rather than at the top of the file so that libtidy
    # is not a requirement to run bungeni.
    import tidy

    body_text = removeSecurityProxy(self.context.body_text)

    # utidylib options
    tidy_options = {
        'output_xhtml': 1,
        'add_xml_decl': 1,
        'indent': 1,
        'tidy_mark': 0,
        'char_encoding': 'utf8',
        'quote_nbsp': 0,
    }

    # remove html entities from the text, then clean up the xhtml with tidy
    ubody_text = unescape(body_text)
    aftertidy = tidy.parseString(ubody_text.encode('utf8'), **tidy_options)

    # tidy returns a document object; str() yields the markup to parse
    dom = parseString(str(aftertidy))
    body = dom.getElementsByTagName("body")[0]
    text = "".join(child.toxml() for child in body.childNodes)
    dom.unlink()
    return text
开发者ID:BenoitTalbot,项目名称:bungeni-portal,代码行数:26,代码来源:reports.py
注:本文中的tidy.parseString函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论