This article collects and organizes typical usage examples of the Python utils.utils.stripHtml function. If you are wondering what exactly the Python stripHtml function does, how to call it, or what real-world stripHtml examples look like, the curated code samples below may help.
A total of 20 stripHtml code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
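All of the examples share the same pattern: an HTML fragment (usually the result of a BeautifulSoup tag's renderContents() call) is passed to stripHtml, which returns the plain text with the markup removed. The actual utils.utils.stripHtml implementation is not reproduced on this page, so the snippet below is only a minimal sketch of the assumed behaviour, written against the BeautifulSoup 3 API these connectors use; treat the function body as an illustration inferred from the call sites, not as the project's real code.

# Minimal sketch (assumption): stripHtml takes an HTML fragment and returns its visible text.
# The real utils.utils.stripHtml may differ; this only mirrors how the examples below call it.
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as used by these connectors

def stripHtml(html_fragment):
    """Return the text content of an HTML fragment with all tags removed."""
    soup = BeautifulSoup(html_fragment)
    # Join every text node found in the fragment and trim surrounding whitespace.
    return ''.join(soup.findAll(text=True)).strip()

# Typical call pattern, mirroring the examples below (title_tag is a hypothetical tag object):
# title = stripHtml(title_tag.renderContents())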
Example 1: __getThreadPage
def __getThreadPage( self ):
"""
It will fetch each thread and its associated information
and add the tasks
"""
threads = [x.findParent('tr') for x in self.soup.find('table',id=re.compile('ViewAllThread')).findAll('a','ForumPostHead')]
for thread in threads:
try:
thread_info = thread.findAll('td',recursive=False)
if not len(thread_info)==6:
log.info(self.log_msg('Not enough fields'))
continue
last_post_info = [x.strip() for x in stripHtml(thread_info[-1].renderContents()).split('\n')]
thread_time = datetime.strptime( last_post_info[0],'%m/%d/%Y %I:%M:%S %p by')
#page['edate_thread_last_post_date'] = datetime.strftime(thread_time,"%Y-%m-%dT%H:%M:%SZ")
self.last_timestamp = max(thread_time , self.last_timestamp )
except:
log.exception(self.log_msg("Today's post, so ignoring"))
continue
if self.total_posts_count > self.max_posts_count:
log.info(self.log_msg('Reached maximum post count, returning False'))
return False
self.total_posts_count = self.total_posts_count + 1
try:
if checkSessionInfo('Search',self.session_info_out, thread_time,\
self.task.instance_data.get('update')) and \
self.max_posts_count >= self.total_posts_count:
continue
temp_task=self.task.clone()
try:
title_tag = thread_info[1].find('a','ForumPostHead')
temp_task.pagedata['title']= stripHtml(title_tag.renderContents())
temp_task.instance_data[ 'uri' ] = title_tag['href']
log.info(stripHtml(title_tag.renderContents()))
except:
log.info(self.log_msg('Cannot find the uri'))
continue
try:
temp_task.pagedata['et_author_name'] = stripHtml(thread_info[2].renderContents())
except:
log.info(self.log_msg('Cannot find author name'))
try:
temp_task.pagedata['et_thread_last_post_author'] = last_post_info[-1]
except:
log.info(self.log_msg('Cannot find the last post author'))
try:
view_reply = {'ei_thread_replies_count':3,'ei_thread_views_count':4}
for each in view_reply.keys():
temp_task.pagedata[each] = int(stripHtml(thread_info[view_reply[each]].renderContents()))
except:
log.info(self.log_msg('Cannot find the replies/views count'))
try:
temp_task.pagedata['edate_last_post_date']= datetime.strftime(thread_time,"%Y-%m-%dT%H:%M:%SZ")
except:
log.info(self.log_msg('Cannot find the last posted date'))
self.linksOut.append( temp_task )
log.info(self.log_msg('Task Added'))
except:
log.info(self.log_msg('Cannot add the Task'))
return True
Developer: jsyadav, Project: CrawlerFramework, Lines: 60, Source: csharpcornerconnector.py
Example 2: __getData
def __getData(self, post):
page = {}
try:
post_tag = BeautifulSoup(post.__str__().replace('/>>','/>'))
table_tag = post_tag.find('table')
if table_tag:
table_tag.extract()
try:
page['data'] = stripHtml(post_tag.renderContents())
page['title']= ''
except:
log.exception(self.log_msg('Data not found for the url %s'%self.currenturi))
return
try:
date_str = stripHtml(table_tag.findAll('strong')[-1].renderContents())
page['posted_date'] = datetime.strftime(datetime.\
strptime(re.sub("(\d+)(st|nd|rd|th)",r"\1",date_str).\
strip(),"%d %B %Y"),"%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('Posted date not found'))
page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
try:
page['et_author_name'] = stripHtml(table_tag.findAll('strong')[0].renderContents())
except:
log.exception(self.log_msg('author name not found'))
except:
log.exception(self.log_msg('post tag not found'))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 30, Source: bankguideconnector.py
Example 3: __getData
def __getData(self, post):
""" This will return the page dictionry
"""
page = {}
try:
date_str = stripHtml(post.find('a',href = re.compile('comment\d+')).\
renderContents()).replace('PST','').replace('PDT','').\
replace('at','').strip()
page['posted_date']= datetime.strptime(date_str,"%d %b %Y %I:%M %p").\
strftime("%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('posted_date could not be found in %s'% self.currenturi))
#log.info(date_str)
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
try:
page['data'] = stripHtml(post.find('div','single_comment',id = re.compile('comment\d+_show')).\
renderContents())
page['title'] = ''
except:
log.info(self.log_msg('post not found in %s'% self.currenturi))
return
try:
page['et_author_name'] = stripHtml(post.find('a').renderContents())
except:
log.exception(self.log_msg('Author name not found'))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 26, Source: gamasutraconnector.py
Example 4: __getData
def __getData(self, post, is_question):
""" This will return the page dictionry
"""
page = {'entity':'question' if is_question else 'answer'}
try:
data_tag = post.find('div', id=re.compile('post_message_\d+'))
[each.findParent('div').extract() for each in data_tag.findAll('div', text='Quote:')]
page['data'] = stripHtml(data_tag.renderContents())
page['title'] = stripHtml(self.soup.find('td', 'navbar').renderContents())
if not is_question:
page['title'] = 'Re:' + page['title']
except:
log.exception(self.log_msg('Data not found'))
page['data'] = ''
if not page['data']:
log.info(self.log_msg("Data is not found for discarding this Post"))
return False
try:
page['et_author_name'] = stripHtml(post.find('a', 'bigusername').renderContents())
except:
log.info(self.log_msg('author name not found'))
try:
date_str = stripHtml(post.find('td', 'thead').renderContents())
date_str = re.sub("(\d+)(st|nd|rd|th)",r"\1", date_str).strip()
page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%B %d, %Y, %I:%M %p'), "%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('posted date not found'))
page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 30, Source: htcpediaconnector.py
Example 5: __getData
def __getData(self, post):
""" This will return the page dictionry
"""
page = {'entity':'answer', 'uri':self.currenturi, 'title': 'Re: ' + self.__thread_topic, 'et_thread_topic':self.__thread_topic}
try:
page['data'] = stripHtml(post.find('div', 'post_fmt').renderContents())
except:
log.info(self.log_msg('Data not found for the url %s'%self.currenturi))
return True
try:
author_tag_str = stripHtml(post.find('div', 'post_hdr_fmt').renderContents())
if 'responded:' in author_tag_str:
page['et_author_name'] = author_tag_str.replace('responded:', '').strip()
else:
author_split = author_tag_str.split('replied to')
page['et_author_name'] = author_split[0].strip()
page['et_data_replied_to'] = author_split [1].split(" 's ")[0].strip()
except:
log.info(self.log_msg("Author's info not available"))
try:
date_str = stripHtml(post.find('div', 'posted_fmt').renderContents()).split('GMT')[0].strip().replace("document.write(DateDelta('", '').strip()
page['posted_date'] = datetime.strftime(datetime.strptime(date_str, '%a %B %d %Y %H:%M:%S'), "%Y-%m-%dT%H:%M:%SZ")
except:
log.info(self.log_msg('posted_date not found in url %s'%self.currenturi))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 25, Source: webmdexchangesconnector.py
Example 6: __getData
def __getData(self, post):
""" This will return the page dictionry
"""
page = {'entity':'question' if self.__is_question else 'answer'}
try:
data_tag = post.find('span', 'postbody')
[each.extract() for each in data_tag.findAll('div', 'quote_container')]
page['data'] = stripHtml(data_tag.renderContents())
page['title'] = stripHtml(self.soup.find('a','maintitle').renderContents())
if not self.__is_question:
page['title'] = 'Re:' + page['title']
except:
log.exception(self.log_msg('Data not found'))
page['data'] = ''
if not page['data']:
log.info(self.log_msg("Data is not found for discarding this Post"))
return False
try:
page['et_author_name'] = stripHtml(post.find('span', 'name').renderContents())
except:
log.info(self.log_msg('author name not found'))
try:
date_str = stripHtml(post.findAll('span', 'postdetails')[1].renderContents()).split('Post subject:')[0].replace('Posted:','').strip()
page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%a %b %d, %Y %I:%M %p'), "%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('posted date not found'))
page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 29, Source: blackberryblastforumsconnector.py
Example 7: __getData
def __getData(self, review, post_type ):
""" This will return the page dictionry
"""
page = {'title':'','posted_date':datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")}
try:
page['et_data_post_type'] = post_type
title_info = review.findAll('td')
page['title'] = stripHtml(title_info[1].renderContents())
page['posted_date'] = datetime.strftime(datetime.strptime(stripHtml\
(title_info[0].renderContents()),'%Y-%m-%d %H:%M'),\
"%Y-%m-%dT%H:%M:%SZ")
page['et_author_name'] = stripHtml(title_info[2].renderContents())
except:
log.info(self.log_msg('title or posted date not found'))
try:
td_tag = review.findNext('tr')
div_tag = td_tag.find('div')
if div_tag:
div_tag.extract()
page['data'] = '\n'.join([x for x in stripHtml(td_tag.renderContents()).split('\n') if not x.strip()=='' and not x.strip().startswith('>') and not re.match('.*wrote:$',x.strip()) and not re.search('napisa.a:$',x.strip()) and not re.search('napisa.\(a\):$',x.strip())])
except:
log.exception(self.log_msg('Data not found for this post'))
page['data'] = ''
try:
if page['title']=='':
if len(page['data']) > 50:
page['title'] = page['data'][:50] + '...'
else:
page['title'] = page['data']
except:
log.exception(self.log_msg('title not found'))
page['title'] = ''
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 34, Source: moneygrupydyskusyjneconnector.py
Example 8: __get_data
def __get_data(self, post, is_original_post):
page = {'entity':'question' if is_original_post else 'answer'}
page['uri'] = self.currenturi
page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
auth_info = self.__get_author_info(post, is_original_post)
if auth_info['name']:
page['et_author_name'] = auth_info['name']
if auth_info['location']:
page['et_author_location'] = auth_info['location']
if is_original_post:
page['data'] = stripHtml([e.strip() for e in post.findAll(text=True) if e.strip()][3])
else:
#page['data'] = stripHtml([e.strip() for e in post.findAll(text=True) if e.strip()][1])
page['data'] = stripHtml(str(post.find('div', 'textLine12 text12 commentBodyBlock')))
if page['data'] == 'Add as friend':
log.info(self.log_msg('post data could not be found in url %s' % self.currenturi))
return False
title = stripHtml(str(self.soup.find('h1', 'text16 global-mfc pm0all textLine12')))
if not is_original_post:
page['title'] = 'Re: ' + title
else:
page['title'] = title
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 28, Source: wellsphereconnector.py
Example 9: _getUserInfo
def _getUserInfo(self,author_profile_link):
try:
user_profile_id = re.findall("http://myaccount\.ibibo\.com/MyIbibo\.aspx\?uId=(.*)$",author_profile_link)[0]
self.currenturi = "http://my.ibibo.com/Profile/view/" + user_profile_id
res=self._getHTML(self.currenturi)
self.rawpage=res['result']
self._setCurrentPage()
try:
self.current_page['ei_author_age']= str(int(stripHtml(self.soup.find('span',attrs={'id':'UserAgeSexLocationInfo'}).previous)))
except:
log.info(self.log_msg("Error occured while fetching author's age"))
try:
self.current_page['et_author_gender']= stripHtml(self.soup.find('span',attrs={'id':'UserAgeSexLocationInfo'}).findNext('span').renderContents().replace(',','').replace('\n',' ').strip().split()[0])
except:
log.info(self.log_msg("Error occured while fetching author's gender"))
try:
self.current_page['et_author_location']= ' '.join(stripHtml(self.soup.find('span',attrs={'id':'UserAgeSexLocationInfo'}).findNext('span').renderContents().replace(',','').replace('\n',' ').strip().split()[1:]))
except:
log.info(self.log_msg("Error occured while fetching author's location"))
log.debug("Fetched user info from the url %s" %author_profile_link)
return True
except:
log.exception(self.log_msg("Exception occured while fetching user profile"))
return False
Developer: jsyadav, Project: CrawlerFramework, Lines: 28, Source: ibiboopinionsconnector.py
Example 10: __getData
def __getData(self, post):
page = {}
try:
page['data'] = stripHtml(post.find('div','footer').findPrevious('p').\
renderContents())
page['title'] = ''
except:
log.exception(self.log_msg('data not found'))
return
try:
date_str = stripHtml(post.find('div','wraptocenter').renderContents()).\
split('posted')[-1].strip()
page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%b-%d-%Y'),"%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('posted_date not found %s'%self.currenturi))
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
try:
page['et_author_name'] = stripHtml(post.find('div','user').find('a').renderContents())
except:
log.exception(self.log_msg('author_name not found %s'%self.currenturi))
try:
page['ef_rating_overall'] = float(stripHtml(post.find('div','rating').\
renderContents()).split('/')[0])
except:
log.exception(self.log_msg('rating tag not found'))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 27, Source: resellerratingsconnector.py
Example 11: __getAuthorInfo
def __getAuthorInfo(self,page):
'''It will fetch the author info
'''
try:
self.currenturi = 'http://kin.naver.com/userinfo/index.php?member_id=%s'%page['et_author_name']
log.info(self.currenturi)
if not self.__setSoup():
return page
except:
log.info(self.log_msg('author url not found'))
return page
try:
aut_info = [int(re.sub('[^\d]','',stripHtml(x.findNext('dd').renderContents()))) for x in self.soup.find('dl','info_count').findAll('dt')]
page['ei_author_questions_count'] =aut_info[0]
page['ei_author_answers_count'] =aut_info[1]
page['ei_author_referals_count'] =aut_info[2]
except:
log.info(self.log_msg('author info count not found'))
try:
aut_info = [float(stripHtml(x.findNext('dd').renderContents()[:-1])) for x in self.soup.find('dl','info_graph').findAll('dd','graph')]
page['ef_author_questioning_percentage'] = aut_info[0]
page['ef_author_answering_percentage'] = aut_info[1]
page['ef_author_writing_percentage'] = aut_info[2]
except:
log.info(self.log_msg('Author info not found , float '))
try:
aut_info = [stripHtml(x.renderContents()) for x in self.soup.find('dl','info_rank').findAll('dd')]
page['ei_author_energy'] =int( re.sub('[^\d]','',aut_info[0]))
page['ei_author_rank'] =int(re.sub('[^\d]','',aut_info[1]))
except:
log.info(self.log_msg('rank not found'))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 32, Source: naverconnector.py
Example 12: __getData
def __getData(self, review, post_type ):
""" This will return the page dictionry
"""
page = {'et_data_post_type':post_type}
## try:
## unique_kye = review.find('div','dt').findPrevious('a')['name']
## except:
## log.info(self.log_msg('unique not found'))
## return False
try:
post_info = review.find('div','postleft')
author_info = post_info.find('dt')
page['et_author_name'] = stripHtml(author_info.renderContents())
profile = author_info.find('a',href=True)
if profile:
page['et_author_profile'] = self.base_url + profile['href']
page['et_author_title'] = stripHtml(post_info.find('dd','usertitle').renderContents())
aut_info = ['Zarejestrowany:','Posty:']
for each in aut_info:
info_str = post_info.find('dd',text= re.compile( each+'.*'))#
if info_str.startswith(aut_info[0]):
date_str = info_str.replace(aut_info[0],'').strip()
page['edate_author_member_since'] = datetime.strftime(datetime.strptime(date_str, '%Y-%m-%d'),"%Y-%m-%dT%H:%M:%SZ")
if info_str.startswith(aut_info[1]):
page['ei_author_posts_count'] = int(info_str.replace(aut_info[1],'').strip())
except:
log.info(self.log_msg('post info not found'))
prev_soup = copy.copy(self.soup)
prev_uri = self.currenturi
try:
self.currenturi = page['et_author_profile']
if self.__setSoup():
author_stat = [int(stripHtml(x.find('span').renderContents())) for x in self.soup.find('div',id='column_center').findAll('p')[1:]]
page['ei_author_opinions_count'] = author_stat[0]
page['ei_author_comments_count'] = author_stat[1]
page['ei_author_rating'] = author_stat[3]
except:
log.info(self.log_msg('Author info not found'))
self.soup =copy.copy(prev_soup)
self.currenturi = prev_uri
try:
date_str = stripHtml(review.find('h2').find('a').renderContents())
page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%Y-%m-%d %H:%M:%S'),"%Y-%m-%dT%H:%M:%SZ")
except:
log.info(self.log_msg('Posted date not found'))
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
try:
page['data'] = stripHtml(review.find('div','postmsg').renderContents())
except:
log.exception(self.log_msg('Data not found for this post'))
page['data'] = ''
try:
if len(page['data']) > 50:
page['title'] = page['data'][:50] + '...'
else:
page['title'] = page['data']
except:
log.exception(self.log_msg('title not found'))
page['title'] = ''
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 60, Source: oceanconnector.py
Example 13: __getThreads
def __getThreads( self ):
""" Get thread information and create tasks.
"""
threads = [each.findParent('tr') for each in self.soup.findAll('a',
'SUBJECT_STYLE')]
if not threads:
log.info(self.log_msg('No Results in url %s'%self.currenturi))
return False
for thread in threads:
if self.__total_threads_count > self.__max_threads_count:
log.info(self.log_msg('Reached maximum post count, returning False'))
return False
self.__total_threads_count += 1
thread_info = thread.findAll('td', recursive=False)
try:
date_str = re.sub('\s+', ' ', stripHtml(thread_info[-1].\
renderContents()))
thread_time = datetime.strptime(date_str, "%m-%d-%Y %I:%M %p")
except Exception, exce:
log.info(self.log_msg('Posted date not found in url %s'\
%self.currenturi))
continue
if checkSessionInfo('Search', self.session_info_out, thread_time,
self.task.instance_data.get('update')):
log.info(self.log_msg('Session info return True'))
return False
self.__last_timestamp = max(thread_time, self.__last_timestamp)
temp_task = self.task.clone()
try:
temp_task.pagedata['title'] = stripHtml(thread.find('a', \
'SUBJECT_STYLE').renderContents().strip()).split(':')\
[-1].strip()
except Exception, exce:
log.info(self.log_msg('Thread title not available'))
Developer: jsyadav, Project: CrawlerFramework, Lines: 34, Source: ficoforumsconnector.py
Example 14: __getData
def __getData(self, review):
'''This will get the review div tag and return a dictionary if all fields
are captured; if no data is found, it will return False'''
page = {'title':self.__title_str} # Title Changed
author_tag = review.find('a', 'avatar_link', href=True)
if author_tag:
page['et_author_name'] = stripHtml(author_tag.renderContents())
page['et_author_profile'] = author_tag['href']
try:
date_str = stripHtml(review.find('a', 'avatar_time').renderContents())
date_obj = datetime.strptime(date_str,'%m/%d/%y')
except:
log.info(self.log_msg('posted date cannot be parsed in url %s'%self.currenturi))
date_obj = datetime.utcnow()
page['posted_date'] = datetime.strftime(date_obj,"%Y-%m-%dT%H:%M:%SZ")
try:
data_str = stripHtml(review.find('span' ,'ctedit').renderContents())
reply_author_match = re.search('@\s*.+?:', data_str)
if reply_author_match:
author_name = reply_author_match.group() # Variable Fixed
page['et_data_replied_author'] = author_name[1:-1].strip()
data_str = data_str.replace(author_name,'',1).strip()
page['data'] = data_str
except:
log.info(self.log_msg('Data not found in url %s'%self.currenturi)) # Url Fixed
page['data'] = ''
if not page['data']:
log.info(self.log_msg('Empty data is found for url %s'%self.currenturi)) # URL Fixed
return False
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 30, Source: gizmodoconnector.py
Example 15: _getParentPage
def _getParentPage(self,parent_uri):#NAMING CONVENTION IS WRONG
##J- I think these need to be in a try/except - if the title or rating parsing fails because the HTML changed, what crashes?
## a try-except-raise
try:
page={}
try:
page['data']= re.sub('\n{2,}','\n',stripHtml(self.soup.find('dd',{'class':'rContent'}).renderContents()))
except Exception, e:
log.exception(self.log_msg('data could not be parsed'))
raise e
try:
page['title'] = stripHtml(self.soup.find('strong',{'id':'q_title'}).renderContents())
except Exception, e:
log.exception(self.log_msg('could not parse page title'))
raise e
try:
page['et_author_name'] = stripHtml(self.soup.find('p',{'class':'nickArea'}).a.renderContents())
except:
log.info('could not parse author name')
try:
page['ei_num_views'] = int(self.soup.find('span',{'id':'viewCount'}).renderContents())
except Exception, e:
log.info(self.log_msg('could not parse number of views'))
Developer: jsyadav, Project: CrawlerFramework, Lines: 26, Source: daumknowledgeconnector.py
Example 16: __getData
def __getData(self, review, post_type ):
""" This will return the page dictionry
"""
page = {'et_data_post_type':post_type}
try:
page['title'] = stripHtml(review.find('h3').renderContents())
except:
log.info(self.log_msg('title not found'))
page['title'] =''
try:
post_info = review.find('p','author')
aut_info = post_info.find('strong')
page['et_author_name'] = stripHtml(aut_info.renderContents())
date_str = stripHtml(aut_info.nextSibling)
page['posted_date'] = datetime.strftime( datetime.strptime(date_str,'%Y-%m-%d, %H:%M'),"%Y-%m-%dT%H:%M:%SZ")
except:
log.info(self.log_msg('post info not found'))
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
try:
page['data'] = stripHtml(review.find('div','content').renderContents())
except:
log.exception(self.log_msg('Data not found for this post'))
page['data'] = ''
try:
if page['title'] =='':
if len(page['data']) > 50:
page['title'] = page['data'][:50] + '...'
else:
page['title'] = page['data']
except:
log.exception(self.log_msg('title not found'))
page['title'] = ''
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 33, Source: bankoweforumconnector.py
Example 17: __getData
def __getData(self, post):
page = {}
try:
page['data'] = stripHtml(post.find('div',{'style':'padding-top:3px; width:350px;'}).\
renderContents())
page['title'] = ''
except:
log.exception(self.log_msg('title and data not found'))
return
try:
date_str = stripHtml(post.find('div',{'style':'text-align:right;width:130px;float:right; top:-2px;left:-3px;'}).\
renderContents()).strip()
page['posted_date'] = datetime.strftime(datetime.strptime(date_str,'%B %d, %Y'),"%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('posted_date not found %s'%self.currenturi))
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
try:
page['et_author_name'] = stripHtml(post.find('div',{'style':'float:left;padding-left:7px; width:120px;'}).\
renderContents())
except:
log.exception(self.log_msg('author_name not found %s'%self.currenturi))
try:
page['ef_rating_overall'] = float(len(post.findAll('img', src='images/star_darkSM.gif')))
except:
log.exception(self.log_msg('rating tag not found'))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 27, Source: americanapparelconnector.py
Example 18: __getData
def __getData(self,review):
'''Doc String
'''
page = {}
try:
comment_head = review.find('div','comment-heading')
rating_str = stripHtml(comment_head.find('p',attrs={'class':re.compile('lr-stars')}).renderContents()).lower()
rating = re.search('\d+',rating_str).group()
if 'half' in rating_str.lower():
rating = rating + '.5'
page['ef_rating_overall']=float(rating)
except:
log.info(self.log_msg('rating overall not found'))
try:
date_str = stripHtml(comment_head.find('p','comment-date').renderContents())
page['posted_date'] = datetime.strftime(datetime.strptime (date_str,'Posted on %b %d, %Y, %I:%M%p'),"%Y-%m-%dT%H:%M:%SZ")
except:
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
log.info(self.log_msg('Posted date not found'))
try:
data_tag = review.find('p','comment')
[x.extract() for x in data_tag.findAll('a')]
page['data'] = stripHtml(data_tag.renderContents())
page['title'] = page['data'][:50] + '...'
page['uri'] = self.currenturi
except Exception, e:
log.exception(self.log_msg('could not get data'))
Developer: jsyadav, Project: CrawlerFramework, Lines: 27, Source: gamezeboconnector.py
Example 19: __getData
def __getData(self, post, is_question):
""" This will return the page dictionary."""
page = {'entity':'question' if is_question else 'answer'}
try:
self.__topic = stripHtml(self.soup.title.renderContents().split('-')[0].strip())
except:
log.info(self.log_msg('Title Not found for uri %s'%self.currenturi))
if is_question:
page['title'] = self.__topic
else:
page['title'] = 'RE: ' + self.__topic
try:
data = post.find('div', attrs={'class':'content'})
qs = data.findAll('blockquote')
for q in qs: q.extract() # Removing quotes
page['data'] = stripHtml(data.renderContents()).strip()
except:
log.info(self.log_msg('Data not found for the url %s'%self.currenturi))
page['data'] = ''
#Sometimes only Image is uploaded on the Post, in that case data will be empty
if not page['data'] and page['title']:
log.info(self.log_msg("Data and Title are not found for %s,discarding this Post"%(self.currenturi)))
return False
try:
page['posted_date'] = datetime.strftime(self.__processTime(post.find('p', attrs={'class': 'author'}).contents[-1][9:].strip()), '%Y-%m-%dT%H:%M:%SZ')
except:
log.exception(self.log_msg('Posted date not found'))
page['posted_date'] = datetime.strftime(datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
try:
author_tag = post.find('p', attrs={'class':'author'}).find('a', href=re.compile('member'))
page['et_author_name'] = stripHtml(author_tag.renderContents())
except:
log.info(self.log_msg('author name not found'))
try:
page['et_author_category'] = stripHtml(post.find('dl', id=re.compile(r'profile\d+')).find('dd').renderContents().strip())
except:
log.info(self.log_msg('author category not found'))
try:
page['et_author_profile'] = author_tag['href']
except:
log.info(self.log_msg('author profile not found'))
try:
page['ei_author_posts_count'] = int(post.find('strong', text='Posts:').parent.nextSibling)
except:
log.info(self.log_msg('author posts count not found'))
try:
author_date_tag = post.find('strong', text='Joined:').parent.nextSibling
page['edate_author_member_since'] = datetime.strftime(datetime\
.strptime(author_date_tag.string.strip(), '%a %b %d, %Y %I:%M %p'), '%Y-%m-%dT%H:%M:%SZ')
except:
page['edate_author_member_since'] = page['posted_date']
log.exception(self.log_msg('author registered date not found'))
if len(self.__hierarchy) >= 3:
page['et_thread_topic'] = self.__hierarchy[-1]
page['et_thread_forum'] = self.__hierarchy[-3]
page['et_thread_subforum'] = self.__hierarchy[-2]
else:
log.info(self.log_msg('Cannot find the Data thread details'))
log.info(self.log_msg(str(self.__hierarchy)))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 60, Source: dubaiforumsconnector.py
Example 20: __getData
def __getData(self, post):
""" This will return the page dictionry
"""
page = {}
try:
user_info = post.find('div','post-user-info')
try:
date_str = stripHtml(user_info.find('abbr','timeago').renderContents()).strip()
page['posted_date']= datetime.strptime(date_str,"%d %b %Y").strftime("%Y-%m-%dT%H:%M:%SZ")
except:
log.exception(self.log_msg('posted_date could not be found in %s'% self.currenturi))
page['posted_date'] = datetime.strftime(datetime.utcnow(),"%Y-%m-%dT%H:%M:%SZ")
try:
page['et_author_name'] = stripHtml(user_info.find('abbr','timeago').\
findNext('a').renderContents())
except:
log.exception(self.log_msg('auth name not found'))
except:
log.exception(self.log_msg('user_info not found in %s'% self.currenturi))
try:
page['data'] = stripHtml(post.find('p','comment-body').renderContents())
page['title'] = ''
except:
log.exception(self.log_msg('Data not found for the url %s'%self.currenturi))
return
try:
rank = BeautifulSoup(post.__str__().replace('/>>','/>').replace('<span>','</span>'))
page['et_data_rating'] = stripHtml(rank.find('span',id = re.compile('post-rank-\d+')).\
renderContents())
except:
log.exception(self.log_msg('data rating not found'))
return page
Developer: jsyadav, Project: CrawlerFramework, Lines: 32, Source: amplicateconnector.py
Note: The utils.utils.stripHtml examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub/MSDocs. The snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors, and redistribution or use should follow the corresponding project's license. Do not reproduce without permission.