本文整理汇总了Python中tidylib.tidy_fragment函数的典型用法代码示例。如果您正苦于以下问题：Python tidy_fragment函数的具体用法？Python tidy_fragment怎么用？Python tidy_fragment使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了tidy_fragment函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_frag_with_entity
def test_frag_with_entity(self):
    """Check how tidy_fragment renders a named character entity.

    By default the named entity is expected to come back unchanged; with
    ``numeric-entities`` enabled the same input is expected to come back
    in numeric form.
    """
    # The fixture has to be spelled as an entity reference rather than as
    # the decoded character; otherwise both halves of the test assert the
    # same value and the ``numeric-entities`` option is never exercised.
    h = "&eacute;"

    expected = "&eacute;"
    doc, err = tidy_fragment(h)
    self.assertEqual(doc, expected)

    expected = "&#233;"
    doc, err = tidy_fragment(h, {'numeric-entities': 1})
    self.assertEqual(doc, expected)
开发者ID:18600597055,项目名称:hue,代码行数:9,代码来源:FragsTest.py
示例2: parse_book_file
def parse_book_file(href, book):
    """Fill ``book`` with data scraped from its HTML page and return it.

    Parameters:
        href: path of the book page, joined onto ``books_dir``.
        book: dict describing the book.  ``book['title']`` is read when
            ``page_count`` is missing; ``page_count``, ``annotation`` and
            ``contents`` are written when the matching data is found.

    Returns:
        The same ``book`` dict, updated in place.
    """
    book_tree = lxml.html.parse(join(books_dir, href), parser)

    if 'page_count' not in book:
        # Pass the title as an XPath variable instead of formatting it into
        # the expression: a title containing an apostrophe would otherwise
        # produce an invalid XPath string literal.
        cells = book_tree.xpath(
            "//td[descendant::*[contains(text(), $title)]]",
            title=book['title'])
        if len(cells):
            page_info = cells[0].xpath(
                "descendant::*[contains(text(), 'страниц')]")
            if len(page_info):
                match = patterns[0][1].search(
                    tostring(page_info[0], encoding='unicode'))
                # Leave ``page_count`` unset when the pattern does not match
                # instead of failing on ``None.groups()``.
                if match is not None:
                    book['page_count'] = match.groups()[0]

    block = {
        'annotation': book_tree.xpath(
            r"//table[descendant::*[contains(text(), 'Аннотация')]]"),
        'contents': book_tree.xpath(
            r"//table[descendant::*[contains(text(), 'Содержание')]]"),
    }
    for key, tables in block.items():
        if not len(tables):
            continue
        # The section body is every sibling following the last matching
        # heading table, up to (not including) the next table.
        parts = []
        for element in tables[-1].itersiblings():
            if element.tag == "table":
                break
            drop_a(element)
            remove_attr(element)
            parts.append(tostring(element, encoding='unicode'))
        book[key] = tidy_fragment(clean("".join(parts)))[0]
    return book
开发者ID:a-iv,项目名称:practica.ru,代码行数:31,代码来源:oldsite_parser.py
示例3: test_frag_with_unclosed_tag
def test_frag_with_unclosed_tag(self):
    """An unclosed ``<p>`` is expected to come back from tidy_fragment closed."""
    # NOTE(review): leading whitespace inside this literal may have been
    # lost when the snippet was extracted -- confirm against tidy's output.
    wanted = "<p>\nhello\n</p>"
    tidied, report = tidy_fragment("<p>hello")
    self.assertEqual(tidied, wanted)
开发者ID:waylan,项目名称:pytidylib,代码行数:7,代码来源:FragsTest.py
示例4: sanitize_html
def sanitize_html(value):
    """Reduce HTML to the small tag set accepted by Vodafone Live.

    Comments are removed, every tag loses its attributes, tags outside the
    allowed set are hidden while their contents are kept, ``em``/``strong``
    are rewritten to ``i``/``b``, the result is passed through
    ``tidy_fragment`` and a handful of character entities are replaced with
    plain ASCII text.

    Parameters:
        value: HTML text, or ``None``.

    Returns:
        The sanitized markup; ``""`` when ``value`` is ``None``.
    """
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor does new-ish
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]

    # Entity references and their plain-text stand-ins.  The needles are
    # written in their escaped, pure-ASCII spelling; applied in this order.
    entity_replacements = (
        ("&nbsp;", " "),
        ("&rsquo;", "'"),
        ("&lsquo;", "'"),
        ("&quot;", '"'),
        ("&ldquo;", '"'),
        ("&rdquo;", '"'),
        ("&bull;", "- "),
        ("&eacute;", "e"),
        ("&Eacute;", "E"),
        ("&ndash;", "-"),
    )

    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        # Compare by value: whether two equal string literals are the same
        # object is an interpreter detail, not something to rely on.
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    for entity, plain in entity_replacements:
        fragment = fragment.replace(entity, plain)
    return fragment
开发者ID:praekelt,项目名称:ummeli,代码行数:60,代码来源:vlive_tags.py
示例5: clean
def clean( html ):
    """Sanitize ``html`` with bleach, then pass it through tidylib.

    Falsy input is handed back untouched.
    """
    if html:
        sanitized = bleach.clean(
            html,
            tags = local_config.TAG_WHITELIST,
            attributes = local_config.ATTRIBUTE_WHITELIST )
        # catches some additional problems
        repaired, _report = tidylib.tidy_fragment( sanitized )
        return repaired
    return html
开发者ID:mrwonko,项目名称:homepage,代码行数:7,代码来源:just_sanitize.py
示例6: link_title_uid_txt
def link_title_uid_txt(i):
    """Extract ``(link, title, rss_uid, txt)`` from one feed item ``i``.

    Returns ``None`` when the item has neither summary nor content, when
    that entry's ``content`` is empty, or when no text is left after the
    HTML has been converted.
    """
    link = i['alternate'][0]['href'] if 'alternate' in i else ''
    title = unescape(i['title']) if 'title' in i else '无题'
    rss_uid = i.get('id') or 1

    snippet = i.get('summary') or i.get('content')
    if not snippet:
        return
    markup = snippet['content']
    if not markup:
        return

    markup = txttidy(markup)
    markup = txt_map('<pre', '</pre>', markup, pre_br)
    markup = tidy_fragment(markup, {'indent': 0})[0]
    # line breaks become real newlines before the HTML is turned into text
    txt = htm2txt(markup.replace('<br />', '\n'))
    if not txt:
        return
    return link, title, rss_uid, txt
开发者ID:immissile,项目名称:42qu_github_mirror,代码行数:31,代码来源:rss_update.py
示例7: get_article_text
def get_article_text(self, body):
    """
    Gets the article main text

    :param body: object searched with ``find`` for the ``div`` whose class
        is ``article-body``
    :return: tuple of (tidied article markup, plain article text); the plain
        text is ``None`` when no such ``div`` is found
    """
    container = body.find("div", {"class": "article-body"})
    if container is None:
        plain_text = None
        markup = ''
    else:
        # The plain text is taken before the tag contents are zapped.
        plain_text = self.gremlin_zapper.zap_string(container.get_text())
        self.zap_tag_contents(container)
        markup = ''.join(str(child) for child in container.contents)
    markup, errors = tidy_fragment(markup, options={'numeric-entities': 1})
    return markup, plain_text
开发者ID:ucsc,项目名称:slug-news,代码行数:25,代码来源:scraper.py
示例8: test_frag_with_unicode_subclass
def test_frag_with_unicode_subclass(self):
    """A ``utype`` subclass instance should pass through tidy_fragment as-is."""
    class MyUnicode(utype):
        pass

    text = MyUnicode("unicode string ß")
    tidied, report = tidy_fragment(text)
    self.assertEqual(tidied, text)
开发者ID:waylan,项目名称:pytidylib,代码行数:8,代码来源:FragsTest.py
示例9: object_for_typepad_object
# Find or build the local Object for a TypePad asset.
# The "already known" path returns a (False, obj) tuple.
# NOTE(review): indentation was lost in this listing and the body appears
# truncated after the final `except` (no save/return on the main path) --
# check the original typepad.py before relying on this snippet.
def object_for_typepad_object(tp_obj):
# Reuse an existing Object stored for this asset's url_id, if there is one.
try:
obj = Object.objects.get(service='typepad.com', foreign_id=tp_obj.url_id)
except Object.DoesNotExist:
pass
else:
log.debug("Reusing typepad object %r for asset %s", obj, tp_obj.url_id)
return False, obj
log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id, tp_obj.author.display_name)
author = account_for_typepad_user(tp_obj.author)
# Prefer the rendered content; fall back to the raw content, wrapping each
# blank-line-separated chunk in <p> when the format asks for converted
# line breaks.
body = tp_obj.rendered_content
if not body and tp_obj.content:
if tp_obj.text_format == 'html_convert_linebreaks':
body = '\n\n'.join(u'<p>%s</p>' % t for t in tp_obj.content.split('\n\n'))
else:
body = tp_obj.content
# Only a non-empty body is passed through tidy; otherwise store ''.
if body:
body, errors = tidy_fragment(body)
else:
body = ''
obj = Object(
service='typepad.com',
foreign_id=tp_obj.url_id,
render_mode='mixed',
title=tp_obj.title,
body=body,
time=tp_obj.published,
permalink_url=tp_obj.permalink_url,
author=author,
)
# Work out what this post refers to: a reply target, a reblogged TypePad
# asset, or a reblogged external URL.
if getattr(tp_obj, 'in_reply_to', None) is not None:
# This post is in reply, so we don't care if our referent was
# really a share. Be transitively in reply to the shared obj.
really_a_share, obj.in_reply_to = object_for_typepad_object(tp_obj.in_reply_to)
elif getattr(tp_obj, 'reblog_of', None) is not None:
# Assets are public so it's okay if we use an anonymous typd here.
t = typd.TypePad(endpoint='http://api.typepad.com/')
reblog_of = t.assets.get(tp_obj.reblog_of.url_id)
really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
remove_reblog_boilerplate_from_obj(obj)
# With no body left after the boilerplate removal, hand back the
# reblogged object instead, flagged True.
if not obj.body:
return True, obj.in_reply_to
elif getattr(tp_obj, 'reblog_of_url', None) is not None:
reblog_url = tp_obj.reblog_of_url
# Failing to resolve the referent URL leaves in_reply_to as None.
try:
in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
except leapfrog.poll.embedlam.RequestError, exc:
in_reply_to = None
except ValueError, exc:
in_reply_to = None
log.error("Error making object from referent %s of %s's post %s", reblog_url, author.display_name, tp_obj.url_id)
log.exception(exc)
开发者ID:apparentlymart,项目名称:leapfrog,代码行数:57,代码来源:typepad.py
示例10: tidy_html
def tidy_html(html):
    """
    Process an input string containing HTML and return a tuple (xhtml,
    errors, warnings) containing the output of tidylib and lists of
    validation errors and warnings.

    Input must be unicode; a ValueError is raised otherwise.
    Tidy is asked for XHTML output (``output-xhtml``).

    A warning about stripped control characters, if any were found, is the
    first entry of the returned warnings list.
    """
    if not isinstance(html, unicode):
        raise ValueError("tidyhtml must be called with a Unicode string!")

    warnings = list()

    # First, deal with embedded control codes.  The offending characters are
    # collected *before* they are substituted; afterwards there is nothing
    # left in ``html`` to report.
    control_chars = CONTROL_CHAR_RE.findall(html)
    html, sub_count = CONTROL_CHAR_RE.subn(" ", html)
    if sub_count:
        warnings.append("Stripped %d control characters from body: %s" % (
            sub_count,
            set(ord(i) for i in control_chars)
        ))

    html, messages = tidylib.tidy_fragment(
        html.strip(),
        {
            "char-encoding": "utf8",
            "clean": False,
            "drop-empty-paras": False,
            "drop-font-tags": True,
            "drop-proprietary-attributes": False,
            "fix-backslash": True,
            "indent": True,
            "output-xhtml": True,
        }
    )

    # One stripped, non-empty entry per line of tidy's report.
    messages = [
        line
        for line in (raw.strip() for raw in messages.split("\n"))
        if line
    ]

    # postprocess warnings to avoid HTML fragments being reported as lacking
    # doctype and title:
    ignored = (
        "Warning: missing <!DOCTYPE> declaration",
        "Warning: inserting missing 'title' element",
        "Warning: inserting implicit <body>",
    )

    errors = list()
    # ``warnings`` is deliberately not re-created here, so the control
    # character warning recorded above is kept.
    for msg in messages:
        if any(marker in msg for marker in ignored):
            continue
        if "Error:" in msg:
            errors.append(msg)
        else:
            warnings.append(msg)

    return html, errors, warnings
开发者ID:akaihola,项目名称:feincms,代码行数:57,代码来源:tidy.py
示例11: cleanupText
def cleanupText(text):
    """This method cleans up the text of the report using libtidy"""
    # tidylib options
    tidy_options = {
        "output_xhtml": 1,
        "add_xml_decl": 1,
        "indent": 1,
        "tidy_mark": 0,
        "char_encoding": "utf8",
        "quote_nbsp": 0,
    }
    # html entities are removed from the text first; the result is handed
    # to tidy as UTF-8 encoded bytes
    encoded_body = unescape(text).encode("utf8")
    tidied, errors = tidy_fragment(encoded_body, tidy_options, keep_doc=False)
    # the tidied result is converted with str() before being returned
    return str(tidied)
开发者ID:BenoitTalbot,项目名称:bungeni-portal,代码行数:10,代码来源:downloaddocument.py
示例12: html
def html(self, string):
    """Parses HTML

    Raises an Exception when the ``allow_html`` setting is missing or equal
    to False.  When it is ``"tidy"`` the string is run through tidylib; for
    any other value the string is returned unchanged.
    """
    html_disabled = (
        "allow_html" not in INGIniousConfiguration
        or INGIniousConfiguration["allow_html"] == False
    )
    if html_disabled:
        raise Exception("HTML is not allowed")

    if INGIniousConfiguration["allow_html"] == "tidy":
        import tidylib
        tidied, _report = tidylib.tidy_fragment(string)
        return tidied

    return string
开发者ID:GuillaumeDerval,项目名称:INGInious,代码行数:10,代码来源:parsable_text.py
示例13: html2xhtml
def html2xhtml(html,**options):
    """Run ``html`` through tidy_fragment and return the resulting document.

    Caller-supplied ``options`` are passed on to tidy, except that
    ``doctype``, ``show_warnings``, ``indent`` and ``output_xml`` are always
    overridden.  Raises an Exception when tidy reports anything.
    """
    options.update(doctype='omit', show_warnings=0, indent=0, output_xml=1)
    document, errors = tidy_fragment(html,options=options)
    if not errors:
        return document
    raise Exception("Errors while processing %s\n==========\n%s" % (html,errors))
开发者ID:MaxTyutyunnikov,项目名称:lino,代码行数:10,代码来源:html2xhtml.py
示例14: fix_open_tags
def fix_open_tags(source):
    """ Fixes missing tags in html fragments.

    Falsy input is returned as-is.  When ``settings.DEBUG`` is on, whatever
    is left of tidy's report after ``filter_tidylib_errors`` is logged at
    debug level.
    """
    if not source:
        return source

    repaired, report = tidy_fragment(source)
    if settings.DEBUG and report:
        remaining = filter_tidylib_errors(report)
        if remaining:
            log.debug('Tidylib errors:\n{}'.format(remaining))
    return repaired
开发者ID:welbornprod,项目名称:wp_site,代码行数:11,代码来源:htmltools.py
示例15: POST
def POST(self):
    """ POST request

    Decodes the JSON body of the request, grades the submission it carries
    and returns a JSON document with the ``correct``, ``score`` and ``msg``
    keys.  Every failure is reported through that same JSON shape, with
    ``correct`` set to ``None`` and ``score`` set to 0.
    """
    def grader_error(reason):
        # Uniform payload shared by all failure paths.
        return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: " + reason + "</p>"})

    web.header('Content-Type', 'application/json')
    post_input = web.data()

    # ``except Exception`` rather than a bare ``except`` throughout, so that
    # SystemExit and KeyboardInterrupt are not turned into grader errors.
    try:
        decoded_input = json.loads(post_input)
    except Exception:
        return grader_error("cannot decode POST")

    # Valid JSON that is not an object cannot carry an xqueue_body; without
    # the type check the ``in`` test itself could raise.
    if not isinstance(decoded_input, dict) or "xqueue_body" not in decoded_input:
        return grader_error("no xqueue_body in POST")

    try:
        edx_input = json.loads(decoded_input["xqueue_body"])
        taskid = json.loads(edx_input["grader_payload"])["tid"]
    except Exception:
        return grader_error("cannot decode JSON")

    try:
        task = course.get_task(taskid)
    except Exception:
        # NOTE(review): taskid comes from the request and is echoed into an
        # HTML message unescaped -- confirm how the receiver renders it.
        return grader_error("unknown task {}".format(taskid))

    if not task.input_is_consistent(edx_input):
        return grader_error("input not consistent with task")

    try:
        job_return = job_manager_sync.new_job(task, edx_input, "Plugin - EDX")
    except Exception:
        return grader_error("error while grading submission")

    try:
        text = ""
        if "text" in job_return:
            text = job_return["text"]
        if "problems" in job_return:
            for prob in job_return["problems"]:
                text += "<br/><h4>" + job_return["task"].get_problems()[prob].get_name() + "</h4>" + job_return["problems"][prob]

        # An explicit score in the job result overrides the 0/1 default.
        score = (1 if job_return["result"] == "success" else 0)
        if "score" in job_return:
            score = job_return["score"]

        import tidylib
        out, dummy = tidylib.tidy_fragment(text, options={'output-xhtml': 1, 'enclose-block-text': 1, 'enclose-text': 1})
        return json.dumps({"correct": (True if (job_return["result"] == "success") else None), "score": score, "msg": out})
    except Exception:
        return grader_error("error converting submission result")
开发者ID:jonsan21,项目名称:INGInious,代码行数:50,代码来源:edx.py
示例16: normalize
def normalize(text):
    """ Normalize whitespace for a string of html using tidylib. """
    # XHTML output with LF newlines; the listed clean-ups are switched off.
    tidy_options = {
        'drop_empty_paras': 0,
        'fix_backslash': 0,
        'fix_bad_comments': 0,
        'fix_uri': 0,
        'join_styles': 0,
        'lower_literals': 0,
        'merge_divs': 0,
        'output_xhtml': 1,
        'quote_ampersand': 0,
        'newline': 'LF',
    }
    output, errors = tidylib.tidy_fragment(text, options=tidy_options)
    return output
开发者ID:manikanta-kumar-allakki,项目名称:polls-india,代码行数:14,代码来源:__init__.py
示例17: html2xhtml
def html2xhtml(html, **options):
    """Run ``html`` through tidy_fragment and return the stripped result.

    Caller-supplied ``options`` are passed on to tidy, except that
    ``doctype``, ``show_warnings``, ``indent`` and ``output_xhtml`` are
    always overridden.  Raises an Exception when tidy reports anything.
    """
    options.update(doctype='omit', show_warnings=0, indent=0, output_xhtml=1)
    document, errors = tidy_fragment(html, options=options)
    if errors:
        raise Exception("Errors while processing %s\n==========\n%s" %
                        (html, errors))
    return document.strip()
开发者ID:zhuangyan,项目名称:lino,代码行数:15,代码来源:html2xhtml.py
示例18: __init__
def __init__(self, op_html):
    """
    Initializes this option with HTML. The HTML is validated before initializing the option.
    The input HTML should be a snippet and not contain the `html`, `head`, `title`, nor `body` tags.
    Throws an HTMLValidationException if the validation produces errors.
    :param op_html: The string representation of the option HTML.
    :return:
    """
    # The snippet is wrapped in a document skeleton before being validated.
    wrapped = "<!DOCTYPE html><html><head><title></title><body>%s</body></html>" % op_html
    document, errors = tidy_fragment(wrapped)
    # A report of length 0 or 1 is treated as "no errors".
    if len(errors) > 1:
        print(errors)
        raise HTMLValidationException()
    Option.__init__(self, op_html)
开发者ID:LegoStormtroopr,项目名称:SMPy,代码行数:17,代码来源:options.py
示例19: mytidy
def mytidy(content):
    """Tidy ``content`` with a fixed option set and return the tidied markup."""
    tidy_options = {
        "output-xhtml": 0,  # XHTML output is not requested
        "indent": 1,  # pretty output; not too much of a performance hit
        "indent-spaces": 4,
        "tab-size": 4,
        "tidy-mark": 0,  # no tidy meta tag in output
        "wrap": 0,  # no wrapping
        "alt-text": "",  # help ensure validation
        "doctype": 'strict',  # little sense in transitional for tool-generated markup
        "force-output": 1,  # may not get what you expect but you will get something
        "char-encoding": 'utf8',
        "input-encoding": 'utf8',
        "output-encoding": 'utf8',
    }
    # Only the first item of tidy_fragment's result is handed back.
    return tidy_fragment(content, tidy_options)[0]
开发者ID:bdrydyk,项目名称:wurdig,代码行数:17,代码来源:tidy_helper.py
示例20: normalize
def normalize(text):
    """ Normalize whitespace for a string of html using tidylib. """
    # Clean-ups that stay switched off.
    tidy_options = dict.fromkeys(
        [
            "drop_empty_paras",
            "fix_backslash",
            "fix_bad_comments",
            "fix_uri",
            "join_styles",
            "lower_literals",
            "merge_divs",
            "quote_ampersand",
        ],
        0,
    )
    # XHTML output with LF newlines.
    tidy_options["output_xhtml"] = 1
    tidy_options["newline"] = "LF"
    normalized, _report = tidylib.tidy_fragment(text, options=tidy_options)
    return normalized
开发者ID:Grassflying2,项目名称:Python-Markdown,代码行数:18,代码来源:__init__.py
注：本文中的tidylib.tidy_fragment函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论