本文整理汇总了Python中tidylib.tidy_document函数的典型用法代码示例。如果您正苦于以下问题:Python tidy_document函数的具体用法?Python tidy_document怎么用?Python tidy_document使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了tidy_document函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_doc_with_entity
def test_doc_with_entity(self):
    """tidy_document keeps named entities by default and converts them
    to numeric character references when 'numeric-entities' is on."""
    # NOTE(review): the published snippet had the entity strings decoded
    # by HTML rendering, leaving both `expected` values identical (a
    # meaningless assertion). Restored to the literal entity text.
    h = "&eacute;"
    expected = DOC % "&eacute;"
    doc, err = tidy_document(h)
    self.assertEqual(doc, expected)
    # With numeric-entities enabled the named entity becomes &#233;.
    expected = DOC % "&#233;"
    doc, err = tidy_document(h, {'numeric-entities': 1})
    self.assertEqual(doc, expected)
开发者ID:18600597055,项目名称:hue,代码行数:9,代码来源:DocsTest.py
示例2: _massage_diff_content
def _massage_diff_content(content):
    """Run *content* through HTML Tidy and return ``(tidied_text, errors)``.

    Tidy is forced to produce plain-HTML output even for broken input.
    If pytidylib trips over the raw text's encoding, the call is retried
    with an explicit UTF-8 encoding of it.
    """
    opts = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        result = tidy_document(content, options=opts)
    except UnicodeDecodeError:
        # In case something happens in pytidylib we'll try again with
        # a proper encoding
        result = tidy_document(content.encode('utf-8'), options=opts)
    tidied, errors = result
    return tidied.decode('utf-8'), errors
开发者ID:VitorVRS,项目名称:kuma,代码行数:14,代码来源:helpers.py
示例3: marklogic_put_xml
def marklogic_put_xml(self, item, spider_name):
    """PUT a scraped *item* as XML into MarkLogic via its REST endpoint.

    The item dict is serialised with dicttoxml, un-escaped back to raw
    XML markup, tidied into well-formed XML, and PUT with digest auth.
    URI/credentials fall back from the item to the pipeline defaults.
    """
    # Set the uri and collection (plus the optional server-side transform)
    if (self.ml_transform == ''):
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
    else:
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name, 'transform': self.ml_transform}
    # Set up the XML payload
    payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
    # Decode the escaped XML characters back again.
    # NOTE(review): the published snippet showed these entity strings
    # already decoded (e.g. replacing '<' with '<', a no-op that
    # contradicts the comment); restored to the intended entities.
    payload = payload.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"')
    # Run tidy in order to get well-formed XML
    payload, errors = tidy_document(payload, options={'input-xml': 1})
    # Set up the header
    headers = {'Content-Type': 'application/xml'}
    ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
    logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])
    # Call the MarkLogic REST endpoint
    ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
    ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
    r = requests.put(ml_uri,
                     params=params,
                     auth=HTTPDigestAuth(ml_user, ml_pwd),
                     data=payload,
                     headers=headers)
    logging.info("PUT response: " + str(r.status_code) + ", " + r.text)
开发者ID:michelderu,项目名称:ml-scrapy-pipeline,代码行数:29,代码来源:pipelines.py
示例4: fetch_data
def fetch_data():
    """Scrape bvb.de for the next Borussia Dortmund match.

    Returns ``(warning_text, match_datetime)``. Exits the process when
    the page lists no upcoming game (IndexError while selecting nodes).
    """
    def bvbreplace(s):
        # Normalise any "...Dortmund..." team name to the short "BVB".
        return "BVB" if "Dortmund" in s else s
    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)
    out = ''
    # Default to "more than a day in the future" so callers still get a
    # usable datetime if date parsing below never runs.
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except Exception:
            # Narrowed from a bare except: no dedicated tournament span,
            # fall back to the third generic span.
            league = select(soup, "div.next-match p span")[2].contents[0].strip()
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
开发者ID:orithena,项目名称:sportswarnbot,代码行数:34,代码来源:bvb.py
示例5: call
def call():
    """Issue the prepared API request and store the parsed response in
    ``world.results``.

    Reads the request type, parameters and headers previously staged on
    the global ``world`` object; does nothing if results already exist.
    """
    # Already fetched for this scenario -- nothing to do.
    if world.results:
        return
    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()
    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        # Default format: 'reverse' requests answer in XML, others in HTML.
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        # Strict HTML validation: any tidy error message fails the step.
        document, errors = tidy_document(page,
                                         options={'char-encoding' : 'utf8'})
        assert(len(errors) == 0), "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        if 'json_callback' in world.params:
            # JSONP response: strip the "callback( ... )" wrapper
            # before decoding the JSON payload.
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func)+1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        # object_pairs_hook=OrderedDict preserves the key order of the
        # JSON document in the decoded result.
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)
开发者ID:mtmail,项目名称:test-nominatim,代码行数:35,代码来源:request_setup.py
示例6: test_xmlns_large_document_xml_corner_case
def test_xmlns_large_document_xml_corner_case(self):
    # Test for a super weird edge case in Tidy that can cause it to
    # return the wrong required buffer size, truncating the output.
    filler = 'A' * 7937
    markup = '<html xmlns="http://www.w3.org/1999/xhtml">'
    markup += '<span><span>A</span></span>' + filler
    document, errors = tidy_document(markup, {'output-xml': 1})
    # A correctly sized buffer yields a complete, closed document.
    self.assertEqual(document.strip()[-7:], "</html>")
开发者ID:GertBurger,项目名称:pytidylib,代码行数:7,代码来源:test_docs.py
示例7: nofoutofplacefeatures
def nofoutofplacefeatures(url):
    """Fetch *url* and return the length of HTML Tidy's error report.

    URLs without a scheme get "http://" prepended. Returns None when
    the request or the tidy pass fails.

    NOTE(review): ``tidy_document`` returns its errors as one string,
    so ``len(errors)`` is the length of the report text rather than a
    count of individual errors -- preserved as-is since callers of this
    feature extractor may depend on the existing scale.
    """
    try:
        if url[:4] == "http":
            r = requests.get(url)
        else:
            url = "http://" + url
            r = requests.get(url)
        data = r.text
        document, errors = tidy_document(data,
                                         options={'numeric-entities': 1})
        return len(errors)
    except Exception:
        # Narrowed from a bare except. Best effort: unreachable or
        # unparsable sites yield None (implicit return).
        pass
开发者ID:BelloHe,项目名称:Malicious_Website_Detection,代码行数:25,代码来源:realtestmodel.py
示例8: convert_to_html
def convert_to_html(filename):
    """Convert *filename* to tidied HTML and write it next to the input.

    Pandoc performs the conversion, HTML Tidy normalises the result,
    and typographic ("smart") quotes are rewritten as HTML entities.
    """
    # Do the conversion with pandoc
    output = pypandoc.convert(filename, 'html')
    # Clean up with tidy...
    output, errors = tidy_document(output, options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)
    # Replace smart quotes with their HTML entities.
    # NOTE(review): the published snippet showed each smart quote being
    # replaced by itself (a no-op); HTML rendering had decoded the
    # entity strings, restored below.
    output = output.replace(u"\u2018", '&lsquo;').replace(u"\u2019", '&rsquo;')
    output = output.replace(u"\u201c", "&ldquo;").replace(u"\u201d", "&rdquo;")
    # write the output
    filename, ext = os.path.splitext(filename)
    filename = "{0}.html".format(filename)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)
    print("Done! Output written to: {}\n".format(filename))
开发者ID:bradmontgomery,项目名称:word2html,代码行数:25,代码来源:main.py
示例9: html2enml
def html2enml(html):
    """Convert an HTML fragment to ENML-compatible markup.

    The input is tidied into XHTML, parsed with lxml, reduced to its
    <body> (renamed to <div>), and stripped of elements and attributes
    that ENML prohibits. Returns the serialised markup.
    """
    # doc, err = tidy_fragment(
    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1
        }
    )
    root = fromstring(doc)
    # XXX dirty hack to circumvent a bug in lxml parser
    root = fromstring(etree.tostring(root))
    logging.debug(etree.tostring(root))
    # tidy_document returns a valid html document which means it
    # usually contains an html tag and a proper body element.
    root = root.find('body')
    if root is None:
        # logging.warn is a deprecated alias; use warning() instead.
        logging.warning("No body on this document")
        logging.warning(html)
        return "<div></div>"
    root.tag = 'div'
    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    # FIXME Skipping dtd validation because of slow DTD creation speed
    # validate_dtd(html, f):
    return etree.tostring(root)
开发者ID:shurain,项目名称:archiver,代码行数:34,代码来源:enml.py
示例10: scrape
def scrape(slug, url, name, title=None):
    """Fetch *url*, tidy and normalise its HTML, pull out the
    ``td#content`` cell, localise its images, and store the result as a
    QuickPage (when ``dbteeth`` is set).

    :param slug: page slug; its first path component also names the
        directory images are saved into.
    :param url: source page to scrape.
    :param name: page name; doubles as the title when *title* is None.
    :param title: optional explicit page title.
    """
    f = urlopen(url)
    doc = f.read()
    # Tidy first so html5lib receives (mostly) well-formed markup.
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs
    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1
    # Download each referenced image and point its tag at the local copy.
    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')
        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt
    # td =
    # no_fonts (td)
    # need to fix links here
    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())
    print slug, content[:60]  # .html() # [:60]
    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
开发者ID:satyadevi-nyros,项目名称:eracks,代码行数:60,代码来源:scrape_pages.py
示例11: __trading_years
def __trading_years(self, instrument):
    """Yield the years (as text) for which Sina has trading history for
    *instrument*, scraped from the year <select> element on its market
    history page."""
    # NOTE(review): the original bound the HTTP response to ``re``,
    # shadowing the ``re`` regex module; renamed to ``response``.
    response = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
    # Tidy first so BeautifulSoup gets well-formed markup.
    document, errors = tidy_document(response.read())
    soup = BeautifulSoup(document)
    node = soup.find('select', attrs={'name': 'year'})
    for option in node.findAll('option'):
        yield option.getText()
开发者ID:B-Rich,项目名称:dealer,代码行数:7,代码来源:sinads.py
示例12: process_response
def process_response(self, request, response):
    """Django middleware hook: validate every HTML response with Tidy.

    Raises HTMLValidationError when tidy reports any problem; all other
    responses pass through untouched.
    """
    is_html = 'text/html' in response['Content-Type']
    if is_html and response.content:
        _, problems = tidy_document(response.content)
        if problems:
            raise HTMLValidationError(problems)
    return response
开发者ID:nosamanuel,项目名称:django-test-validator,代码行数:7,代码来源:middleware.py
示例13: _tidysrc
def _tidysrc(self,data,srccode):
"""tidy scribe the html src"""
try:
from tidylib import tidy_document
BASE_OPTIONS = {
"output-xhtml": 1, # XHTML instead of HTML4
"indent": 1, # Pretty; not too much of a performance hit
"tidy-mark": 0, # No tidy meta tag in output
"wrap": 0, # No wrapping
"alt-text": "", # Help ensure validation
"doctype": 'strict', # Little sense in transitional for tool-generated markup...
"force-output": 1, # May not get what you expect but you will get something
"char-encoding":'utf-8',
"input-encoding":srccode,
"output-encoding":'utf-8',
}
if not isinstance(data, unicode):
try:
data = data.decode(srccode)
except:
pass
doc, errors = tidy_document(data,options={'numeric-entities':1})
return doc
except:
return data
开发者ID:adam139,项目名称:xsgs.theme,代码行数:26,代码来源:homepage.py
示例14: dynamic_test_method
def dynamic_test_method(self):
    """this function name doesn't matter much, it can start with `test`,
    but we're going to rename it dynamically below"""
    report_url = '/report?reportname=' + reportItem.metadata['action']
    response = self._my_app.get(report_url)
    tidy_opts = {'show-errors': 1, 'show-warnings': 0}
    code, error = tidylib.tidy_document(response.body, options=tidy_opts)
    # Any tidy error text means the page was not valid HTML.
    self.assertFalse(error, '%s did not return valid html page' % report_url)
开发者ID:woodenshoe,项目名称:infoshopkeeper,代码行数:7,代码来源:test_unittest.py
示例15: getMenu
def getMenu():
storeFile = open("list.txt","r")
txt = storeFile.read()
storeFile.close()
list=txt.split('\n\n\n')
# print list
for store in list:
# print store
rest = store.split('\n')
if len(rest)!=3:
break
try:
url=baseUrl+rest[2] +'menu'
print url
res=urlopen(url)
html=res.read()
options = {'output-encoding':'utf8', 'output-xhtml':1 }
document,errors = tidy_document(html,options)
filepath = dataDir+ (rest[2].split('/'))[2] + ".html"
saveFile = open(filepath,"w")
saveFile.write(document)
saveFile.close()
print filepath
except :
print "skip:"+url
开发者ID:blueskywalker,项目名称:menuReview,代码行数:31,代码来源:getMenu.py
示例16: get_employees
def get_employees(lastname, firstname):
payload = { 'find' : lastname }
res = requests.get('https://www.campus.rwth-aachen.de/rwth/all/lecturerlist.asp', params=payload)
if res.status_code == 200:
persons = [ ]
document, errors = tidy_document(res.content, options={'numeric-entities': 1, 'output_xhtml': 1})
tree = ET.fromstring(strip_ns(document))
try:
filename = posixpath.basename(urlparse.urlsplit(res.url).path)
if filename == 'lecturer.asp':
fullname = tree.find('body/table[1]/tr[3]//tr[2]/td[2]').text.strip()
unit = tree.find("body/table[2]//td[@class='h3']/a").text.strip()
persons.append(fullname)
elif filename == 'lecturerlist.asp':
links = [ ]
for cell in tree.findall('body/table[2]//td[3]/table[2]//td[1]/a'):
if cell is not None:
fullname = cell.text.strip()
persons.append(fullname)
else:
raise Exception
except:
print "===> WARNING: failed to get employee list for: %s, %s" % (firstname, lastname)
return persons
开发者ID:BenediktAllendorf,项目名称:snippets,代码行数:29,代码来源:sciebo_shares.py
示例17: sanitize
def sanitize(note):
    """Sanitise a note's HTML content in place for Evernote.

    Optionally wraps the content in a configured template, protects
    configured elements behind placeholder tokens, strips prohibited
    attributes/elements, runs HTML Tidy, restores the protected
    elements, and normalises the title.
    """
    debug('Sanitizing note content...', 2)
    if get_setting('evernote/sanitize/@applytemplate') == 'True':
        with open(get_setting('evernote/sanitize/template/text()'), 'r') as file:
            template = file.read()
        template = template.replace('{content}', note['content'])
        note['content'] = transform(template)
    # Stash elements matching the preserve patterns behind unique
    # placeholders so the regex cleanup and tidy cannot mangle them.
    preservedElements = []
    preservePattern = get_setting('evernote/sanitize/preserve/pattern/text()')
    preserves = get_setting('evernote/sanitize/preserve/elements/text()').split(',')
    for preserve in preserves:
        matches = re.findall(preservePattern.format(preserve), note['content'])
        for match in matches:
            placeholder = '{%s}' % uuid.uuid4().hex
            preservedElements.append({'placeholder': placeholder, 'element': match})
            note['content'] = note['content'].replace(match, placeholder, 1)
    note['content'] = re.sub(get_setting('evernote/sanitize/attributes/empty/text()'), '', note['content'])
    note['content'] = re.sub(get_setting('evernote/sanitize/attributes/prohibited/text()'), '', note['content'])
    note['content'] = re.sub(get_setting('evernote/sanitize/elements/text()'), '', note['content'])
    note['content'] = note['content'].encode('utf-8', errors='ignore')
    (note['content'], errors) = tidy_document(note['content'])
    # Put the protected elements back in place of their placeholders.
    for element in preservedElements:
        note['content'] = note['content'].replace(element['placeholder'], element['element'])
    # 'is not None' instead of '!= None' (identity check for None).
    if note['title'] is not None:
        # NOTE(review): the published snippet's last replace was a
        # no-op (space -> space); consistent with the entity corruption
        # elsewhere on the page, it was most likely '&nbsp;' -> ' ',
        # restored below -- confirm against the upstream source.
        note['title'] = note['title'].replace('\n', ' ').replace('\r', '').replace('&nbsp;', ' ')
    else:
        note['title'] = get_setting('evernote/sanitize/defaulttitle/text()')
开发者ID:arychj,项目名称:Notilitus,代码行数:33,代码来源:notilitus.py
示例18: cleanUpHTML
def cleanUpHTML(html, options=None):
    """Tidy *html* into strict XHTML and return ``(pretty_html, errors)``.

    Footnotes and empty paragraphs are fixed up with BeautifulSoup
    before tidying (HTMLTidy would otherwise drop the footnotes), and a
    UTF-8 Content-Type meta tag is added to the result.

    :param options: optional dict of extra tidy options merged over the
        defaults below.
    """
    import tidylib
    tidylib.BASE_OPTIONS = {}
    default_options = {
        "force-output": 1,
        "output-xhtml": 1,
        "doctype": "strict",
        "drop-empty-paras": 1,
        "output-encoding": "utf8",
        "clean": 1,
        "bare": 1
    }
    if options:
        # BUG FIX: dict has no .extend() method -- the original raised
        # AttributeError whenever caller-supplied options were passed.
        default_options.update(options)
    # first fix up footnotes so that HTMLTidy won't ditch them
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    footnoteFixer(soup)  # html)
    stripEmptyParagraphs(soup)
    html, errors = tidylib.tidy_document(soup.prettify(encoding=None), options=default_options)
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    addMetaTag(soup, [('http-equiv', 'Content-type'), ('content', 'text/html; charset=utf-8')])
    return soup.prettify(encoding=None), errors
开发者ID:kollivier,项目名称:brightwriter,代码行数:27,代码来源:htmlutils.py
示例19: html_clean
def html_clean(self, html):
    """Tidy *html* and strip it down to plain text.

    Returns the cleaned text, or None when tidy reports errors or the
    cleaner cannot parse the tidied markup.
    """
    tidy_options = {
        'drop-proprietary-attributes': 1,
        'alt-text': '',
        'hide-comments': 1,
        'output-xhtml': 1,
        'show-body-only': 1,
        'clean': 1,
        'char-encoding': 'utf8',
        'show-warnings': 0,
        'show-info': 0,
    }
    # First we pass it through tidy
    (html, errors) = tidylib.tidy_document(html, options=tidy_options)
    if errors:
        print(("HTML tidy failed for %s!" % self.msgid))
        print(errors)
        return None
    try:
        cleaner = HTMLCleaner()
        cleaner.feed(html)
        return cleaner.get_text()
    except Exception as e:
        # Failed to parse the html, thus failed to clean it. so we must
        # give up...
        return None
开发者ID:mhagander,项目名称:pgarchivesweb,代码行数:27,代码来源:parser.py
示例20: test_doc_with_unclosed_tag
def test_doc_with_unclosed_tag(self):
    """Tidy must close a dangling <p> and indent its text content."""
    fragment = "<p>hello"
    closed = "<p>\nhello\n</p>"
    document, errors = tidy_document(fragment)
    self.assertEqual(document, DOC % closed)
开发者ID:18600597055,项目名称:hue,代码行数:7,代码来源:DocsTest.py
注:本文中的tidylib.tidy_document函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论