本文整理汇总了Python中util.ElementHelper类的典型用法代码示例。如果您正苦于以下问题:Python ElementHelper类的具体用法?Python ElementHelper怎么用?Python ElementHelper使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了ElementHelper类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: get_clustered_records
def get_clustered_records(cls, doctree):
    """Mark similar nodes level by level, then merge them into record groups.

    For every level in ``range(level(body) + 1, depth(root) + 1)`` each node
    is compared (via ``cls.similar_check``) with a window of up to five
    following nodes on the same level, plus whatever iterating the first
    window node yields (presumably its children - confirm against the
    element type in use).  The first similar pair found gets
    ``kg_record_mark`` set to ``'1'`` on both nodes.

    :param doctree: parsed document tree
    :return: the result of ``cls.merger_sibling_record_node(doctree)``
    """
    levels = cls.bfs_tree(doctree)
    root = ElementHelper.get_root(doctree)
    body = ElementHelper.get_body(doctree)

    # Level range to scan: just below <body> up to the depth of the root.
    stop_level = int(ElementHelper.get_element_depth(root)) + 1
    start_level = int(body.get(px)) + 1

    for depth in range(start_level, stop_level):
        # Nodes that are records (or sit below one) are not compared again.
        pending = [n for n in levels[depth]
                   if not cls.is_node_or_ancestor_record(n)]
        count = len(pending)
        # NOTE(review): the upper bound ``count - 1`` means a level with only
        # two pending nodes is never compared - confirm this is intended.
        for pos in range(1, count - 1):
            anchor = pending[pos - 1]
            # Horizontal comparison: at most five nodes starting at ``pos``.
            window = pending[pos:min(count, pos + 5)]
            # Vertical comparison: append what the first window node yields.
            window.extend(window[0])
            for other in window:
                if cls.similar_check(anchor, other):
                    anchor.set(kg_record_mark, '1')
                    other.set(kg_record_mark, '1')
                    break

    return cls.merger_sibling_record_node(doctree)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:35,代码来源:api3.py
示例2: similar_check
def similar_check(cls, nodeA, nodeB):
    """Return True when *nodeA* and *nodeB* have a similar tag structure.

    Two nodes are similar when they share the same tag and the tags found
    among the nodes returned by ``ElementHelper.get_children`` (ignoring
    ``<a>``) differ only slightly:

    * the number of distinct tags differs by at most one,
    * the number of distinct levels (``px`` attribute) differs by at most one,
    * every tag of *nodeA* other than a few inline tags also occurs
      under *nodeB*,
    * the summed count of unique tags per level differs by at most one.
    """
    if nodeA.tag != nodeB.tag:
        return False

    def profile(root):
        # Build (tag -> levels, level -> tags) for the nodes below ``root``.
        levels_by_tag, tags_by_level = {}, {}
        for inner in ElementHelper.get_children(root):
            # <a> is not treated as a distinguishing tag.
            if inner.tag == 'a':
                continue
            level = int(inner.get(px))
            levels_by_tag.setdefault(inner.tag, []).append(level)
            tags_by_level.setdefault(level, []).append(inner.tag)
        return levels_by_tag, tags_by_level

    def unique_tag_total(tags_by_level):
        # Sum, over all levels, of the number of unique tags on that level.
        return sum([len(StringHelper.unique(tags_by_level[level]))
                    for level in tags_by_level])

    levels_a, tags_a = profile(nodeA)
    levels_b, tags_b = profile(nodeB)

    if abs(len(levels_a) - len(levels_b)) > 1:
        return False
    if abs(len(tags_a) - len(tags_b)) > 1:
        return False

    # Inline formatting tags may be missing on the other side.
    tolerated = ('em', 'b', 'br', 'i', 'font')
    for tag in levels_a:
        if tag in tolerated:
            continue
        if tag not in levels_b:
            return False

    return not abs(unique_tag_total(tags_a) - unique_tag_total(tags_b)) > 1
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:34,代码来源:api2.py
示例3: get_title_util
def get_title_util(body, title_text):
    """Find the element under *body* that best matches *title_text*.

    Candidates are the elements of every tag in ``TITLE_TAG`` that pass
    ``is_possible_title_tag``.  Each candidate with non-empty text is scored
    with ``longest_common_length(node_text, title_text)``; candidates scoring
    below 1 are dropped.  Among the highest-scoring candidates the one with
    the smallest pre-order number (the earliest in the document) wins.

    :param body: element to search in
    :param title_text: text of the page title
    :return: the chosen element, or None when *title_text* is shorter than
        two characters or no candidate shares any text with it
    """
    if len(title_text) < 2:
        return None

    # 1. Collect candidate nodes and score them against the title text.
    candidate_nodes = []
    for tag in TITLE_TAG:
        nodes = ElementHelper.get_element_by_tag(body, tag)
        if nodes is None or len(nodes) < 1:
            continue
        candidate_nodes.extend(
            [node for node in nodes if is_possible_title_tag(node)])

    scores = {}
    for node in candidate_nodes:
        node_text = ElementHelper.element_text_content(node)
        if len(node_text) == 0:
            continue
        common = longest_common_length(node_text, title_text)
        if common >= 1:
            scores[node] = common
    if len(scores) == 0:
        return None

    # 2. Keep the best-scoring nodes and return the earliest one.
    # The original sorted with ``cmp=`` (removed in Python 3) and guarded the
    # single-result case with the wrong list; a key-based ``min`` covers both
    # the single and the multi-candidate case.
    best_score = max(scores.values())
    best_nodes = [node for node in scores if scores[node] == best_score]
    return min(best_nodes, key=ElementHelper.get_element_preorder_num)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:35,代码来源:api2.py
示例4: is_cluster_all_links
def is_cluster_all_links(cluster):
    """Decide whether *cluster* consists (almost) only of link text.

    If every text-bearing node in the cluster is a link node, the cluster is
    link noise of the form::

        <a> link </a>
    or
        <li> <a> link </a> </li>

    Returns True when there is no non-link text node, when the non-link
    nodes carry no text, or when the link text count is more than twice the
    non-link text count; False otherwise.
    """
    skipped_inline_tags = ('em', 'strong', 'span', 'i', 'b')

    # Gather every text-bearing node below the cluster members.
    text_nodes = []
    for member in cluster:
        for inner in ElementHelper.get_children(member):
            if ElementHelper.is_element_text_none(inner):
                continue
            if inner.tag in skipped_inline_tags:
                continue
            text_nodes.append(inner)

    # Split into link nodes (an <a>, or directly inside an <a>) and the rest.
    link_nodes, other_nodes = [], []
    for item in text_nodes:
        if item.tag == 'a' or item.getparent().tag == 'a':
            link_nodes.append(item)
        else:
            other_nodes.append(item)

    link_text_count = cluster_text_number(link_nodes)
    other_text_count = cluster_text_number(other_nodes)

    if len(other_nodes) == 0 or other_text_count == 0:
        return True
    return 1.0 * link_text_count / other_text_count > 2.0
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:28,代码来源:api2.py
示例5: get_aricle_cetd
def get_aricle_cetd(doctree):
    """Return the text of the document body after the CETD clean-up passes.

    Runs ``cetd_parse`` on *doctree*, then applies ``CleanTreeByMark`` and
    ``RemoveAttribute`` to the body element, and finally returns the body's
    text content.
    """
    cetd_parse(doctree)
    page_body = ElementHelper.get_body(doctree)
    for cleanup in (CleanTreeByMark, RemoveAttribute):
        cleanup(page_body)
    return ElementHelper.element_text_content(page_body)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:7,代码来源:cetd.py
示例6: print_cluster_record
def print_cluster_record(cls, clusters, doctree):
    """Debug helper: print the tree, then the xpath of every clustered node.

    :param clusters: mapping whose values are iterables of nodes; only keys
        with ``len(key) > 1`` are printed
    :param doctree: document tree the nodes belong to
    """
    ElementHelper.print_element(doctree)
    for key in clusters:
        if len(key) > 1:
            # Parenthesised single-argument prints produce the same output
            # under Python 2 and are valid syntax under Python 3.
            print('====' * 10)
            for node in clusters[key]:
                print('%s %s' % (
                    ElementHelper.get_xpath_by_element(node, doctree),
                    node.get(py)))
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py
示例7: get_headline_content_in_cleaned_body
def get_headline_content_in_cleaned_body(body):
    """Return the text of the h1-h4 elements under *body*, one per line.

    Elements for which ``ElementHelper.is_element_content_none`` is true are
    left out.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4']
    texts = []
    for heading in ElementHelper.get_elements_by_tagnames(body, heading_tags):
        if ElementHelper.is_element_content_none(heading):
            continue
        texts.append(ElementHelper.element_text_content(heading))
    return '\n'.join(texts)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py
示例8: CleanTreeByMark
def CleanTreeByMark(element):
    """Prune the subtree of *element* according to the ``kg_mark`` attribute.

    * mark ``0``: the element is removed via ``ElementHelper.remove_element``;
    * mark ``1``: the element is kept and its subtree is not visited;
    * any other mark: every child is processed recursively.
    """
    # ``int`` handles the same input as the Python-2-only ``long``.
    mark = int(element.get(kg_mark))
    if mark == 0:
        ElementHelper.remove_element(element)
    elif mark != 1:
        # Iterate over a snapshot: the recursive call may remove the child
        # from this element, and mutating the child list while iterating it
        # can skip siblings.
        for child in list(element):
            CleanTreeByMark(child)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:9,代码来源:cetd.py
示例9: get_clustered_records
def get_clustered_records(cls, doctree):
    """Cluster similar nodes per level and return the marked record groups.

    For every scanned level the unmarked nodes are split into groups by
    ``cls.segement``.  Inside a group each node is compared with all nodes
    that follow its predecessor; similar pairs get ``kg_record_mark`` set to
    ``'1'`` and are collected in a set keyed by the first matching node of
    the group.  The clusters are then passed through
    ``cls.merger_sibling_record_node`` and only entries whose key is still
    marked ``'1'`` are returned.

    :param doctree: parsed document tree
    :return: dict mapping a record node to the set of nodes similar to it
    """
    levels = cls.bfs_tree(doctree)
    root = ElementHelper.get_root(doctree)
    body = ElementHelper.get_body(doctree)

    # Level range to scan: just below <body> up to the depth of the root.
    stop_level = int(ElementHelper.get_element_depth(root)) + 1
    start_level = int(body.get(px)) + 1

    # Records the similar nodes found so far.
    cluster = {}
    for depth in range(start_level, stop_level):
        # Nodes that are records (or sit below one) are not compared again.
        pending = [n for n in levels[depth]
                   if not cls.is_node_or_ancestor_record(n)]
        # Comparison happens inside each segment returned by cls.segement.
        for _, group in cls.segement(pending).items():
            seed = None
            members = set()
            for pos in range(1, len(group)):
                if group[pos].get(kg_record_mark) == '1':
                    continue
                previous = group[pos - 1]
                # Compare with every remaining node of the group.
                for candidate in group[pos:]:
                    if not cls.similar_check(previous, candidate):
                        continue
                    if seed is None:
                        seed = previous
                    members.add(previous)
                    previous.set(kg_record_mark, '1')
                    candidate.set(kg_record_mark, '1')
                    members.add(candidate)
            if seed is not None:
                cluster[seed] = members

    merged = cls.merger_sibling_record_node(doctree, cluster)
    return {k: v for k, v in merged.items() if k.get(kg_record_mark) == '1'}
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:54,代码来源:api.py
示例10: get_meta_content
def get_meta_content(doctree, metaAttrName, value):
    """Extract the ``content`` of a given ``<meta>`` element of a document.

    Example ``(metaAttrName, value)`` pairs::

        (name, description)
        (name, keyword)
        (property, og:type)

    :return: the normalised content of the first matching meta element, or
        ``''`` when no element matches or it has no ``content`` attribute
    """
    matches = ElementHelper.get_element_by_tag_attr(
        doctree, 'meta', metaAttrName, value)
    if matches is None or len(matches) == 0:
        return ''

    content = ElementHelper.get_attribute(matches[0], 'content')
    if content is None:
        return ''
    return normalize_word(content)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:14,代码来源:api2.py
示例11: get_clustered_records
def get_clustered_records(cls, doctree):
    """Mark same-tag similar nodes on each level, then merge record groups.

    For every scanned level each node is compared (via
    ``cls.similar_check``) with the following nodes of the same level that
    carry the same tag.  The first similar pair found gets
    ``kg_record_mark`` set to ``'1'`` on both nodes.

    :param doctree: parsed document tree
    :return: the result of ``cls.merger_sibling_record_node(doctree)``
    """
    levels = cls.bfs_tree(doctree)
    root = ElementHelper.get_root(doctree)
    body = ElementHelper.get_body(doctree)

    # Level range to scan: just below <body> up to the depth of the root.
    stop_level = int(ElementHelper.get_element_depth(root)) + 1
    start_level = int(body.get(px)) + 1

    for depth in range(start_level, stop_level):
        same_level = levels[depth]
        # The next level is looked up but not used below; it belongs to a
        # vertical search that is currently disabled.
        try:
            next_level_nodes = levels[depth + 1]
        except KeyError:
            next_level_nodes = None

        # Nodes that are records (or sit below one) are not compared again.
        pending = [n for n in same_level
                   if not cls.is_node_or_ancestor_record(n)]
        for pos in range(1, len(pending) - 1):
            anchor = pending[pos - 1]
            # Only nodes with the same tag name are compared horizontally.
            for other in pending[pos:]:
                if other.tag != anchor.tag:
                    continue
                if cls.similar_check(anchor, other):
                    anchor.set(kg_record_mark, '1')
                    other.set(kg_record_mark, '1')
                    break

    return cls.merger_sibling_record_node(doctree)
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:49,代码来源:api2.py
示例12: merger_sibling_record_node
def merger_sibling_record_node(cls, doctree, cluster):
    """Merge data records.

    The record marks are corrected level by level: starting with the
    children of ``<body>``, every level is handed to
    ``cls.correct_record_mark`` together with *cluster* (see that method for
    the correction rules), and the walk continues with the children of
    nodes for which ``cls.is_node_or_ancestor_record`` is false.

    :param doctree: DOM tree already marked by the preliminary similarity
        comparison
    :param cluster: preliminary sets of similar data records
    :return: the *cluster* object that was passed in
    """
    body = ElementHelper.get_body(doctree)
    frontier = list(body)

    while frontier:
        cls.correct_record_mark(frontier, cluster)
        below = []
        for parent in frontier:
            if len(parent) > 0:
                # Children are followed only when the parent itself is not
                # (inside) a record.
                below.extend([child for child in parent
                              if not cls.is_node_or_ancestor_record(parent)])
        frontier = below

    return cluster
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:32,代码来源:api.py
示例13: collect_urls
def collect_urls(html, base_url, encoding=None):
    """Yield the collectable URLs found in the ``<a>`` tags of a page.

    :param html: page string
    :param base_url: URL the links are normalised against
    :param encoding: encoding passed to ``HtmlHelper.create_doc``
    :return: generator of normalised URLs
    """
    doctree = HtmlHelper().create_doc(html, encoding)
    for anchor in ElementHelper.get_elements_by_tagnames(doctree, 'a'):
        href = m_strip(anchor.get('href', None))
        if href is None or len(href) < 2:
            continue
        # A leading '#' is a link into the page itself.
        if href[0] == '#':
            continue
        url = normalize_url(href, base_url)
        # Skip URLs reported by is_url_visited and URLs that
        # should_collect_url rejects.
        if is_url_visited(url, unvisited_url_set) or not should_collect_url(url, base_url):
            continue
        yield url
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:26,代码来源:url.py
示例14: find_first_sibling_record_node
def find_first_sibling_record_node(cls, element, doctree):
    """Find the first node of the record region that *element* belongs to.

    1. Look at the index in the last step of the element's xpath.  An index
       below 2 means there is no sibling to the left, so *element* itself is
       returned (likewise when the parent has fewer than two children).
    2. Otherwise start at ``index - 2`` in the parent and move left while
       the sibling there is marked as a record; the node right of the first
       unmarked sibling is returned.
    """
    parent = element.getparent()
    if len(parent) < 2:
        return element

    xpath = ElementHelper.get_xpath_by_element(element, doctree)
    last_step = xpath.split('/')[-1]
    position = StringHelper.get_digits(last_step)
    if position < 2:
        return element

    # xpath positions are 1-based, so position - 2 is the left sibling.
    cursor = position - 2
    while cursor >= 0 and parent[cursor].get(kg_record_mark) == '1':
        cursor -= 1
    return parent[cursor + 1]
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:26,代码来源:api2.py
示例15: is_node_or_children_record
def is_node_or_children_record(cls, element):
    """Return True when the nodes below *element* carry exactly two distinct
    ``kg_record_mark`` values (as counted by ``StringHelper.unique``)."""
    distinct_marks = StringHelper.unique(
        [inner.get(kg_record_mark)
         for inner in ElementHelper.get_children(element)])
    return len(distinct_marks) == 2
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py
示例16: is_possible_title_tag
def is_possible_title_tag(element):
    """Return True when *element* may hold the page title.

    The element must have a tag listed in ``TITLE_TAG``, at most one child,
    and content that ``ElementHelper.is_element_content_none`` does not
    report as empty.
    """
    return (element.tag in TITLE_TAG
            and len(element) <= 1
            and not ElementHelper.is_element_content_none(element))
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:8,代码来源:api2.py
示例17: pre_process_domtree
def pre_process_domtree(doctree):
    """Return *doctree* when ``is_set_attribute_valid`` accepts its root.

    :return: *doctree* itself, or None when *doctree* is None or the check
        on the root element fails
    """
    if doctree is None:
        return None
    root = ElementHelper.get_root(doctree)
    return doctree if is_set_attribute_valid(root) else None
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:9,代码来源:api2.py
示例18: is_cluster_contain_user_comments
def is_cluster_contain_user_comments(cluster):
    """Identify whether the nodes of *cluster* look like user comments.

    Only link (``<a>``) texts are considered:

    1. every node in the cluster needs at least three link nodes with text
       and at least two non-link nodes with text, otherwise the answer is
       False;
    2. a comment block is reported when at least two different link texts
       (ignoring empty and all-digit texts) each occur exactly
       ``len(cluster)`` times.

    :param cluster: sequence of nodes forming one cluster
    :return: True when the cluster is recognised as user comments
    """
    # A single node cannot be identified.
    if len(cluster) < 2:
        return False

    # Count how often each link text occurs across the whole cluster.
    text_counts = {}
    for node in cluster:
        children = ElementHelper.get_children(node)
        link_nodes = [n for n in children if is_link_node_with_text(n)]
        plain_nodes = [n for n in children if is_none_link_node_with_text(n)]
        if len(link_nodes) < 3:
            return False
        if len(plain_nodes) < 2:
            return False
        for link in link_nodes:
            text = ElementHelper.element_text_content(link)
            text_counts[text] = text_counts.get(text, 0) + 1

    # Drop empty and all-digit texts.  Filtering into a new list avoids the
    # deep copy the original made just to delete keys while iterating.
    occurrences = [count for text, count in text_counts.items()
                   if len(text) != 0 and not StringHelper.is_digits(text)]

    # ``number`` is an occurrence count, ``counter`` is how many texts share it.
    for number, counter in collections.Counter(occurrences).most_common():
        if number > 1 and number == len(cluster) and counter >= 2:
            print('find comment!')
            return True
    return False
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:44,代码来源:api2.py
示例19: html2words
def html2words(docstring, base_url, encoding=None, supervisior=None):
    """Extract the main content of a page from its HTML source.

    :param docstring: page source
    :param base_url: URL of the page, stored in the result and used to
        resolve links
    :param encoding: encoding passed to ``HtmlHelper.create_doc``
    :param supervisior: optional object forwarded to
        ``get_link_word_by_pair``
    :return: a ``Document`` with the keys ``base_url``, ``title``, ``meta``,
        ``headlines``, ``para`` and ``url_items``; None when the source is
        smaller than 1 KiB (by ``sys.getsizeof``) or cannot be parsed or
        pre-processed
    """
    # Ignore sources whose in-memory size is below one kibibyte.
    if sys.getsizeof(docstring) / (1024) < 1:
        return None

    docstring = docstring.lower()
    doctree = HtmlHelper.create_doc(docstring, encoding)
    if doctree is None:
        return None

    # Keep an untouched copy: the meta description is read from it after the
    # working tree has been processed.
    pristine_doc = copy.deepcopy(doctree)

    doctree = HtmlHelper.pre_process_domtree(doctree)
    if doctree is None:
        return None

    # Page title and paragraph content.
    para, title = HtmlHelper.get_article(doctree, debug=False)
    # Meta description from the untouched copy.
    meta_description = HtmlHelper.get_meta_description(pristine_doc)
    # Headlines of the processed body.
    headlines = HtmlHelper.get_headline_content_in_cleaned_body(
        ElementHelper.get_body(doctree))
    # All link items of the page.
    url_items = list(get_link_word_by_pair(docstring, base_url, supervisior))

    document = Document()
    fields = (
        ('base_url', base_url),
        ('title', title),
        ('meta', meta_description),
        ('headlines', headlines),
        ('para', para),
        ('url_items', url_items),
    )
    for key, field_value in fields:
        document[key] = field_value
    return document
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:55,代码来源:htmlParse.py
示例20: get_link_word_by_pair
def get_link_word_by_pair(docstring, base_url, supervisior=None, encoding='utf-8'):
    """Yield a ``UrlItem`` for every usable ``<a>`` link of a page.

    :param docstring: page source
    :param base_url: URL of the page; a unicode value is encoded to UTF-8
    :param supervisior: optional object whose ``predict(anchor_text)``
        returns ``(label, interestness)``; without it every item gets the
        label ``'1'`` and interestness ``0.0``
    :param encoding: encoding passed to ``HtmlHelper.create_doc``
    :return: generator of ``UrlItem`` objects
    """
    doctree = HtmlHelper().create_doc(docstring, encoding)
    if isinstance(base_url, unicode):
        base_url = base_url.encode('utf-8')

    for anchor in ElementHelper.get_elements_by_tagnames(doctree, 'a'):
        href = m_strip(anchor.get('href', None))
        if href is None or len(href) < 2:
            continue
        # A leading '#' is a link into the page itself.
        if href[0] == '#':
            continue
        url = normalize_url(href, base_url)
        # Skip URLs reported by is_url_visited.
        if is_url_visited(url, unvisited_url_set):
            continue

        item = UrlItem()
        item['parent_url'] = base_url
        item['url'] = url
        item['anchor_text'] = ElementHelper.element_text_content(anchor).encode('utf-8')
        item['neigb_text'] = ''
        if supervisior is None:
            # '1' denotes a negative sample.
            item['label'], item['interestness'] = '1', 0.0
        else:
            item['label'], item['interestness'] = supervisior.predict(item['anchor_text'])
        yield item
开发者ID:actlea,项目名称:TopicalCrawler,代码行数:42,代码来源:url.py
注:本文中的util.ElementHelper类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论